
    i                        d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlmZmZmZm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(  e jR                  e*      Z+e ed       G d de                    Z,e ed       G d de                    Z- G d dej\                        Z/ G d dej\                        Z0 G d d ej\                        Z1	 	 dWd!ej\                  d"ejd                  d#ejd                  d$ejd                  d%ejd                  dz  d&e3dz  d'e3d(ee   fd)Z4 G d* d+ej\                        Z5 G d, d-ej\                        Z6 G d. d/ej\                        Z7 G d0 d1ej\                        Z8 G d2 d3ej\                        Z9 G d4 d5e      Z: G d6 d7ej\                        Z;d8 Z< G d9 d:ej\                        Z= G d; d<ej\                        Z> G d= d>ej\                        Z? G d? d@ej\                        Z@e G dA dBe             ZA G dC dDej\                        ZBe G dE dFeA             ZC G dG dHej\                        ZD G dI dJej\                        ZE G dK dLej\                        ZF edM       G dN dOeA             ZG G dP dQej\                        ZH G dR dSej\                        ZIe G dT dUeA             ZJg dVZKy)XzPyTorch DPT (Dense Prediction Transformers) model.

This implementation is heavily inspired by OpenMMLab's implementation, found here:
https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/dpt_head.py.

    N)Callable)	dataclass)nn)CrossEntropyLoss   )initialization)ACT2FN)load_backbone)GradientCheckpointingLayer)BaseModelOutputDepthEstimatorOutputSemanticSegmenterOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringlogging	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )	DPTConfigz
    Base class for model's outputs that also contains intermediate activations that can be used at later stages. Useful
    in the context of Vision models.:
    )custom_introc                   l    e Zd ZU dZdZej                  dz  ed<   dZe	ej                  df   dz  ed<   y)*BaseModelOutputWithIntermediateActivationsak  
    last_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
        Intermediate activations that can be used to compute hidden states of the model at various layers.
    Nlast_hidden_states.intermediate_activations)
__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r    tuple     u/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/dpt/modeling_dpt.pyr   r   -   s?     48))D07EIeE$5$5s$:;dBIr*   r   z
    Base class for model's outputs that also contains a pooling of the last hidden states as well as intermediate
    activations that can be used by the model at later stages.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)	4BaseModelOutputWithPoolingAndIntermediateActivationsa  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Last layer hidden-state of the first token of the sequence (classification token) after further processing
        through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
        the classification token after processing through a linear layer and a tanh activation function. The linear
        layer weights are trained from the next sentence prediction (classification) objective during pretraining.
    intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
        Intermediate activations that can be used to compute hidden states of the model at various layers.
    Nlast_hidden_statepooler_output.hidden_states
attentionsr    )r!   r"   r#   r$   r.   r%   r&   r'   r/   r0   r(   r1   r    r)   r*   r+   r-   r-   @   s     37u((4/6.2M5$$t+2:>M5**C/047>7;Je'',-4;EIeE$5$5s$:;dBIr*   r-   c                   r     e Zd ZdZddedeeef   dz  f fdZddZ	 dde	j                  ded	efd
Z xZS )DPTViTHybridEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    Nconfigfeature_sizec                 b   t         
|           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }t        |      | _        | j                  j                  d   }t        | j                  j                        dk7  r+t        dt        | j                  j                               ddg| _        ||j                   }	|	dd  }|	d   }nCt        |t        j                  j                        r|n||f}| j                  j                  d   }|| _        |d   | _        || _        t#        j$                  ||d      | _        t#        j(                  t+        j,                  dd|j
                              | _        t#        j(                  t+        j,                  d|dz   |j
                              | _        y )Nr   r   r   z1Expected backbone to have 3 output features, got kernel_size)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterabler
   backbonechannelslen
ValueErrorresidual_feature_map_indexbackbone_featmap_shaper   Conv2d
projection	Parameterr%   zeros	cls_tokenposition_embeddings)selfr4   r5   r=   r>   r?   r@   num_patchesfeature_dimfeat_map_shape	__class__s             r+   r<   zDPTViTHybridEmbeddings.__init__`   s   !'!2!2F4E4EJ
$*$7$79K9Kk#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY%f-mm,,R0t}}%%&!+PQTUYUbUbUkUkQlPmnoo+,a&'#::N)"#.L(+K !+<9Q9Q RYegsXt  --004K$$Q-())K!Lekk!Q8J8J&KL#%<<A{QPVPbPb0c#d r*   c                 r   |d d d |f   }|d|d f   }t        t        |      dz        }|j                  d||d      j                  dddd      }t        j
                  j                  |||fd      }|j                  dddd      j                  d||z  d      }t        j                  ||gd	      }|S 
Nr         ?r   r7   r      bilinear)sizemodedim)	r   rG   reshapepermuter   
functionalinterpolater%   catrQ   posembgrid_size_heightgrid_size_widthstart_index
posemb_tokposemb_gridold_grid_sizes           r+   _resize_pos_embedz(DPTViTHybridEmbeddings._resize_pos_embed   s    A||O,
Q_-!#k"2c"9:!))!]M2NVVWXZ[]^`abmm//CSUdBelv/w!))!Q15==aAQTcAceghJ4!<r*   pixel_valuesinterpolate_pos_encodingreturnc                    |j                   \  }}}}|| j                  k7  rt        d      |sV|| j                  d   k7  s|| j                  d   k7  r2t        d| d| d| j                  d    d| j                  d    d	      | j	                  | j
                  || j                  z  || j                  z        }| j                  |      }|j                  d   }	| j                  D 
cg c]  }
|j                  |
    }}
| j                  |	      j                  d	      j                  dd	      }| j                  j                  |dd      }t        j                   ||fd
      }||z   }t#        ||      S c c}
w )NeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   zInput image size (*z) doesn't match model (z).r7   rY   r]   )r   r    )shaper?   rH   r=   rl   rP   r>   rE   feature_mapsrI   rL   flatten	transposerO   expandr%   rc   r   )rQ   rm   rn   
batch_sizer?   heightwidthrP   backbone_outputfeaturesindexoutput_hidden_states
embeddings
cls_tokenss                 r+   forwardzDPTViTHybridEmbeddings.forward   s    3?2D2D/
L&%4,,,w  (++u8J/J (% 9+,Adooa.@-AE 
 #44$$f&?$//AY
 --5"//3 RVQpQpq < <U Cqq__X.66q9CCAqI
^^**:r2>
YY
J7Q?
  "55
 :)%9
 	
  rs   )E9Nr   F)r!   r"   r#   r$   r   r(   intr<   rl   r%   Tensorboolr   r   __classcell__rU   s   @r+   r3   r3   Y   sY     ey  ec3h$8N  eD LQ&
!LL&
DH&
	3&
r*   r3   c                   N     e Zd ZdZ fdZddZdej                  defdZ	 xZ
S )DPTViTEmbeddingszB
    Construct the CLS token, position and patch embeddings.

    c                    t         |           t        j                  t	        j
                  dd|j                              | _        t        |      | _	        | j                  j                  }t        j                  t	        j
                  d|dz   |j                              | _        t        j                  |j                        | _        || _        y )Nr   )r;   r<   r   rM   r%   rN   r@   rO   DPTViTPatchEmbeddingspatch_embeddingsrR   rP   Dropouthidden_dropout_probdropoutr4   )rQ   r4   rR   rU   s      r+   r<   zDPTViTEmbeddings.__init__   s    ekk!Q8J8J&KL 5f =++77#%<<A{QPVPbPb0c#d zz&"<"<=r*   c                 ~   |d d d |f   }|d|d f   }t        |j                  d      dz        }|j                  d||d      j                  dddd      }t        j
                  j                  |||fd      }|j                  dddd      j                  d||z  d      }t        j                  ||gd	      }|S rW   )	r   r[   r_   r`   r   ra   rb   r%   rc   rd   s           r+   rl   z"DPTViTEmbeddings._resize_pos_embed   s    A||O,
Q_-!+"2"21"5"<=!))!]M2NVVWXZ[]^`abmm//CSUdBelv/w!))!Q15==aAQTcAceghJ4!<r*   rm   ro   c                    |j                   \  }}}}| j                  j                  }| j                  | j                  ||z  ||z        }| j                  |      }|j                         \  }}	}
| j                  j                  |dd      }t        j                  ||fd      }||z   }| j                  |      }t        |      S )Nr7   r   r]   )r   )rs   r4   r>   rl   rP   r   r[   rO   rw   r%   rc   r   r   )rQ   rm   rx   r?   ry   rz   r>   rP   r   seq_len_r   s               r+   r   zDPTViTEmbeddings.forward   s    2>2D2D/
L&% [[++
"44$$f
&:EZ<O
 **<8
!+!2
GQ ^^**:r2>
YY
J7Q?
  "55
\\*-
9ZXXr*   r   )r!   r"   r#   r$   r<   rl   r%   r   r   r   r   r   s   @r+   r   r      s-    
YELL Y5_ Yr*   r   c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )r   z$
    Image to Patch Embedding.

    r4   c                    t         |           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }|| _        || _        || _        || _
        t        j                  ||||      | _        y )Nr   r   )r:   stride)r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rR   r   rK   rL   )rQ   r4   r=   r>   r?   r@   rR   rU   s          r+   r<   zDPTViTPatchEmbeddings.__init__   s    !'!2!2F4E4EJ
$*$7$79K9Kk#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY$$(&))L+:^hir*   rm   ro   c                     |j                   \  }}}}|| j                  k7  rt        d      | j                  |      j	                  d      j                  dd      }|S )Nrq   rY   r   )rs   r?   rH   rL   ru   rv   )rQ   rm   rx   r?   ry   rz   r   s          r+   r   zDPTViTPatchEmbeddings.forward  sb    2>2D2D/
L&%4,,,w  __\2::1=GG1M
r*   
r!   r"   r#   r$   r   r<   r%   r   r   r   r   s   @r+   r   r      s1    
jy jELL U\\ r*   r   modulequerykeyvalueattention_maskscalingr   kwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|||z   }t        j
                  j                  |d      }t        j
                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr7         rY   r   r]   )ptrainingr   )
r[   r%   matmulrv   r   ra   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   r   attn_weightsattn_outputs
             r+   eager_attention_forwardr     s     **R.D( <<s}}Q':;gEL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r*   c                        e Zd Zdef fdZdej                  dee   de	ej                  ej                  f   fdZ
 xZS )DPTSelfAttentionr4   c                 2   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        |j                  | _        | j                  dz  | _        d| _        t        j                  |j                  | j                  |j                         | _        t        j                  |j                  | j                  |j                         | _        t        j                  |j                  | j                  |j                         | _        y )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .r   F)bias)r;   r<   r@   num_attention_headshasattrrH   r4   r   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr   Linearqkv_biasr   r   r   rQ   r4   rU   s     r+   r<   zDPTSelfAttention.__init__.  sF    : ::a?PVXhHi"6#5#5"6 7334A7 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PP"??//5YYv1143E3EFOO\
99V//1C1C&//ZYYv1143E3EFOO\
r*   r0   r   ro   c                    |j                   d   }|d| j                  | j                  f} | j                  |      j                  | j                  dd      } | j                  |      j                  | j                  dd      } | j                  |      j                  | j                  dd      }t        j                  | j                  j                  t              } || |||d f| j                  | j                  | j                  sdn| j                   d|\  }	}
|	j#                         d d | j$                  fz   }|	j'                  |      }	|	|
fS )Nr   r7   r   rY           )r   r   r   r8   )rs   r   r   r   viewrv   r   r   r   get_interfacer4   _attn_implementationr   r   r   r   r   r[   r   r_   )rQ   r0   r   rx   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shapes               r+   r   zDPTSelfAttention.forwardB  sY   
 #((+
D$<$<d>V>VV	0DHH]+00)<FFq!L	4djj/44i@JJ1aP4djj/44i@JJ1aP(?(M(MKK,,.E)
 *=
*
 nnLL#}}C$2C2C
*
 
*
& #0"4"4"6s";t?Q?Q>S"S%--.EFo--r*   )r!   r"   r#   r   r<   r%   r   r   r   r(   r   r   r   s   @r+   r   r   -  sN    ]y ](.||. +,. 
u||U\\)	*	.r*   r   c                   x     e Zd ZdZdef fdZdej                  dej                  dej                  fdZ xZ	S )DPTViTSelfOutputz
    The residual connection is defined in ViTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r4   c                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        y r   )	r;   r<   r   r   r@   denser   r   r   r   s     r+   r<   zDPTViTSelfOutput.__init__k  sB    YYv1163E3EF
zz&"<"<=r*   r0   input_tensorro   c                 J    | j                  |      }| j                  |      }|S r   r   r   rQ   r0   r   s      r+   r   zDPTViTSelfOutput.forwardp  s$    

=1]3r*   r   r   s   @r+   r   r   e  s=    
>y >
U\\  RWR^R^ r*   r   c                   f     e Zd Zdef fdZdej                  dee   dej                  fdZ	 xZ
S )DPTViTAttentionr4   c                 b    t         |           t        |      | _        t	        |      | _        y r   )r;   r<   r   	attentionr   outputr   s     r+   r<   zDPTViTAttention.__init__x  s&    )&1&v.r*   r0   r   ro   c                 V     | j                   |fi |\  }}| j                  ||      }|S r   )r   r   )rQ   r0   r   self_attn_outputr   r   s         r+   r   zDPTViTAttention.forward}  s5    
 -dnn]EfE!-}=r*   )r!   r"   r#   r   r<   r%   r   r   r   r   r   r   s   @r+   r   r   w  s>    /y /
|| +, 
	r*   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )DPTViTIntermediater4   c                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r;   r<   r   r   r@   intermediate_sizer   rA   
hidden_actstrr	   intermediate_act_fnr   s     r+   r<   zDPTViTIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r*   r0   ro   c                 J    | j                  |      }| j                  |      }|S r   )r   r   )rQ   r0   s     r+   r   zDPTViTIntermediate.forward  s&    

=100?r*   	r!   r"   r#   r   r<   r%   r   r   r   r   s   @r+   r   r     s*    9y 9U\\ ell r*   r   c                   t     e Zd Zdef fdZdej                  dej                  dej                  fdZ xZS )DPTViTOutputr4   c                     t         |           t        j                  |j                  |j
                        | _        t        j                  |j                        | _	        y r   )
r;   r<   r   r   r   r@   r   r   r   r   r   s     r+   r<   zDPTViTOutput.__init__  sB    YYv779K9KL
zz&"<"<=r*   r0   r   ro   c                 T    | j                  |      }| j                  |      }||z   }|S r   r   r   s      r+   r   zDPTViTOutput.forward  s.    

=1]3%4r*   r   r   s   @r+   r   r     s8    >y >
U\\  RWR^R^ r*   r   c                   j     e Zd ZdZdef fdZdej                  dee	   dej                  fdZ
 xZS )DPTViTLayerz?This corresponds to the Block class in the timm implementation.r4   c                 r   t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        y )Nr   eps)r;   r<   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormr@   layer_norm_epslayernorm_beforelayernorm_afterr   s     r+   r<   zDPTViTLayer.__init__  s    '-'E'E$(0.v6"6* "V-?-?VEZEZ [!||F,>,>FDYDYZr*   r0   r   ro   c                     | j                  |      } | j                  |fi |}||z   }| j                  |      }| j                  |      }| j	                  ||      }|S r   )r   r   r   r   r   )rQ   r0   r   hidden_states_normattention_outputlayer_outputs         r+   r   zDPTViTLayer.forward  sr    
 "22=A)4>>*<GG )=8 ++M:((6 {{<?r*   )r!   r"   r#   r$   r   r<   r%   r   r   r   r   r   r   s   @r+   r   r     sC    I[y [|| +, 
	r*   r   c                   t     e Zd ZdZ fdZd Zd Zddeej                     deej                     fdZ
 xZS )	DPTReassembleStagea@  
    This class reassembles the hidden states of the backbone into image-like feature representations at various
    resolutions.

    This happens in 3 stages:
    1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to
       `config.readout_type`.
    2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`.
    3. Resizing the spatial dimensions (height, width).

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    c                     t         |           || _        t        j                         | _        |j                  r| j                  |       n| j                  |       |j                  | _	        y r   )
r;   r<   r4   r   
ModuleListlayers	is_hybrid_init_reassemble_dpt_hybrid_init_reassemble_dptneck_ignore_stagesr   s     r+   r<   zDPTReassembleStage.__init__  sU    mmo,,V4%%f-"(";";r*   c           	      v   t        t        t        |j                              |j                        D ]r  \  }}|dk  r.| j
                  j                  t        j                                9|dkD  s?| j
                  j                  t        ||j                  |   |             t |j                  dk7  rt        d|j                   d      t        j                         | _        t        |      }t        t        |j                              D ]  }|dk  rA| j                  j                  t        j                  t        j                                      I|dkD  sO| j                  j                  t        j                  t        j                   d|z  |      t"        |j$                                   y)a   "
        For DPT-Hybrid the first 2 reassemble layers are set to `nn.Identity()`, please check the official
        implementation: https://github.com/isl-org/DPT/blob/f43ef9e08d70a752195028a51be5e1aff227b913/dpt/vit.py#L438
        for more details.
        r   rF   factorprojectzReadout type z! is not supported for DPT-Hybrid.rY   N)ziprangerG   neck_hidden_sizesreassemble_factorsr   appendr   IdentityDPTReassembleLayerreadout_typerH   r   readout_projects_get_backbone_hidden_size
Sequentialr   r	   r   )rQ   r4   ir   r@   s        r+   r   z.DPTReassembleStage._init_reassemble_dpt_hybrid  sX    U3v'?'?#@A6C\C\] 	tIAvAv""2;;=1Q""#5fvG_G_`aGbkq#rs		t )+}V-@-@,AAbcdd !#/7s63345 	AAv%%,,R]]2;;=-IJQ%%,,MM"))AO["I6RXRcRcKde		r*   c           	      <   t        t        t        |j                              |j                        D ]9  \  }}| j
                  j                  t        ||j                  |   |             ; |j                  dk(  rt        j                         | _        t        |      }t        t        |j                              D ]Y  }| j                  j                  t        j                  t        j                  d|z  |      t        |j                                   [ y y )Nr   r  rY   )r  r  rG   r  r  r   r  r  r	  r   r   r
  r  r  r   r	   r   )rQ   r4   r  r   r@   r   s         r+   r   z'DPTReassembleStage._init_reassemble_dpt  s    U3v'?'?#@A6C\C\] 	pIAvKK1&6C[C[\]C^gmno	p )+$&MMOD!3F;K3v7789 %%,,MM"))AO["I6RXRcRcKde ,r*   r0   ro   c                    g }t        |      D ]  \  }}|| j                  vr|dddf   |ddddf   }}|j                  \  }}	}
|||j                  ||||
      }n"t	        |	dz        }|j                  ||||
      }|j                  dddd      j                         }|j                  }| j                  j                  dk(  r|j                  d      j                  d      }|j                  d      j                  |      } | j                  |   t        j                  ||fd	            }|j                  ddd      j                  |      }nM| j                  j                  d
k(  r4|j                  d      |j                  d	      z   }|j                  |      } | j                  |   |      }|j!                  |        |S )z
        Args:
            hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
                List of hidden states from the backbone.
        Nr   r   rX   r   rY   r  )r   rY   r   r7   add)	enumerater   rs   r_   r   r`   r   r4   r	  ru   	unsqueeze	expand_asr
  r%   rc   r   r  )rQ   r0   patch_heightpatch_widthoutr  hidden_staterO   rx   sequence_lengthr?   r[   feature_shapereadouts                 r+   r   zDPTReassembleStage.forward
  s    (7 	%OA|///*6q!t*<l1ab5>Q<	<H<N<N9
O\+0G#/#7#7
LR]_k#lL$_c%9:D#/#7#7
D$P\#]L+33Aq!Q?JJL , 2 2;;++y8#/#7#7#:#B#B9#ML'11!4>>|LG#;4#8#8#;EII|U\F]_a<b#cL#/#7#71a#@#H#H#WL[[--6#/#7#7#:Y=P=PQS=T#TL#/#7#7#FL-t{{1~l;JJ|$3	%6 
r*   NN)r!   r"   r#   r$   r<   r   r   listr%   r   r   r   r   s   @r+   r   r     s@    
<4
#T%,,%7 #aefkfrfras #r*   r   c                     | j                   ,t        | j                   d      r| j                   j                  S | j                  S )Nr@   )backbone_configr   r@   )r4   s    r+   r  r  0  s;    )gf6L6Lm.\%%111!!!r*   c                   2     e Zd Zdededef fdZd Z xZS )r  r4   rF   r   c           	      \   t         |           t        |      }t        j                  ||d      | _        |dkD  r t        j                  ||||d      | _        y |dk(  rt        j                         | _        y |dk  r,t        j                  ||dt        d|z        d      | _        y y )Nr   )in_channelsout_channelsr:   r   r:   r   paddingr   )
r;   r<   r  r   rK   rL   ConvTranspose2dresizer  r   )rQ   r4   rF   r   r@   rU   s        r+   r<   zDPTReassembleLayer.__init__8  s    /7))(`ab A:,,XxV\blmnDKq[++-DKaZ))HhAcRSV\R\oghiDK r*   c                 J    | j                  |      }| j                  |      }|S r   )rL   r&  )rQ   r  s     r+   r   zDPTReassembleLayer.forwardG  s$    |4{{<0r*   )r!   r"   r#   r   r   r<   r   r   r   s   @r+   r  r  7  s&    jy jC j jr*   r  c                   *     e Zd Zdef fdZd Z xZS )DPTFeatureFusionStager4   c                     t         |           t        j                         | _        t        t        |j                              D ]&  }| j                  j                  t        |             ( y r   )
r;   r<   r   r   r   r  rG   r  r  DPTFeatureFusionLayerrQ   r4   r   rU   s      r+   r<   zDPTFeatureFusionStage.__init__N  sR    mmos63345 	>AKK4V<=	>r*   c                     |d d d   }g }d }t        || j                        D ]*  \  }}|	 ||      }n	 |||      }|j                  |       , |S )Nr7   )r  r   r  )rQ   r0   fused_hidden_statesfused_hidden_stater  layers         r+   r   zDPTFeatureFusionStage.forwardT  sq    %dd+ !#&}dkk#B 	;L%!)%*<%8"%*+=|%L"&&'9:	; #"r*   )r!   r"   r#   r   r<   r   r   r   s   @r+   r)  r)  M  s    >y >#r*   r)  c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )DPTPreActResidualLayerz
    ResidualConvUnit, pre-activate residual unit.

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    r4   c                 l   t         |           |j                  | _        |j                  |j                  n| j                   }t        j                         | _        t        j                  |j                  |j                  ddd|      | _
        t        j                         | _        t        j                  |j                  |j                  ddd|      | _        | j                  rIt        j                  |j                        | _        t        j                  |j                        | _        y y )Nr   r   )r:   r   r$  r   )r;   r<   !use_batch_norm_in_fusion_residualuse_batch_normuse_bias_in_fusion_residualr   ReLUactivation1rK   fusion_hidden_sizeconvolution1activation2convolution2BatchNorm2dbatch_norm1batch_norm2)rQ   r4   r6  rU   s      r+   r<   zDPTPreActResidualLayer.__init__n  s   $FF 11= ..((( 	$ 779II%%%%,
 779II%%%%,
 !~~f.G.GHD!~~f.G.GHD r*   r  ro   c                    |}| j                  |      }| j                  |      }| j                  r| j                  |      }| j	                  |      }| j                  |      }| j                  r| j                  |      }||z   S r   )r8  r:  r5  r>  r;  r<  r?  rQ   r  residuals      r+   r   zDPTPreActResidualLayer.forward  s    ''5((6++L9L''5((6++L9Lh&&r*   r   r   s   @r+   r2  r2  e  s2     Iy  ID'ELL 'U\\ 'r*   r2  c                        e Zd ZdZd
dedef fdZddej                  dej                  dz  dej                  fd	Z	 xZ
S )r+  a3  Feature fusion layer, merges feature maps from different stages.

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
        align_corners (`bool`, *optional*, defaults to `True`):
            The align_corner setting for bilinear upsample.
    r4   align_cornersc                     t         |           || _        t        j                  |j
                  |j
                  dd      | _        t        |      | _        t        |      | _	        y )Nr   T)r:   r   )
r;   r<   rD  r   rK   r9  rL   r2  residual_layer1residual_layer2)rQ   r4   rD  rU   s      r+   r<   zDPTFeatureFusionLayer.__init__  sT    *))F$=$=v?X?Xfgnrs5f=5f=r*   Nr  rB  ro   c                    |l|j                   |j                   k7  r?t        j                  j                  ||j                   d   |j                   d   fdd      }|| j	                  |      z   }| j                  |      }t        j                  j                  |dd| j                        }| j                  |      }|S )NrY   r   rZ   Fr[   r\   rD  scale_factorr\   rD  )rs   r   ra   rb   rF  rG  rD  rL   rA  s      r+   r   zDPTFeatureFusionLayer.forward  s    !!X^^3==44L$6$6q$9<;M;Ma;P#QXbrw 5  ($*>*>x*HHL++L9}}00qzI[I[ 1 
 |4r*   Tr   )r!   r"   r#   r$   r   r   r<   r%   r   r   r   r   s   @r+   r+  r+    sI    >y > >ELL ELL4<O [`[g[g r*   r+  c                   |     e Zd ZU eed<   dZdZdZdZdZ	dZ
dZdZeedZ ej"                          fd       Z xZS )DPTPreTrainedModelr4   dptrm   )imageT)r0   r1   c                     t         |   |       t        |t        t        f      r?t        j                  |j                         t        j                  |j                         yy)zInitialize the weightsN)	r;   _init_weightsrA   r   r3   initzeros_rO   rP   )rQ   r   rU   s     r+   rR  z DPTPreTrainedModel._init_weights  sM     	f%f/1GHIKK(()KK223 Jr*   )r!   r"   r#   r   r'   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr%   no_gradrR  r   r   s   @r+   rN  rN    sa    $O!&*#N"&$&
 U]]_4 4r*   rN  c            	       Z     e Zd Zdef fdZ	 ddej                  dedee	   de
fdZ xZS )	DPTViTEncoderr4   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        y c c}w r   )	r;   r<   r4   r   r   r  num_hidden_layersr   r0  r,  s      r+   r<   zDPTViTEncoder.__init__  sC    ]]vG_G_A`#aAK$7#ab
#as   Ar0   r~   r   ro   c                 L    | j                   D ]
  } ||      } t        |      S )N)r.   )r0  r   )rQ   r0   r~   r   layer_modules        r+   r   zDPTViTEncoder.forward  s.     !JJ 	8L(7M	8 ??r*   r   )r!   r"   r#   r   r<   r%   r   r   r   r   r   r   r   r   s   @r+   r`  r`    sL    cy c IN@"\\@AE@Y_`rYs@	@r*   r`  c            	            e Zd Zddedef fdZd Ze ed      e	de
j                  dee   d	efd
                     Z xZS )DPTModelr4   add_pooling_layerc                 T   t         |   |       || _        |j                  rt	        |      | _        nt        |      | _        t        |      | _        t        j                  |j                  |j                        | _        |rt        |      nd| _        | j!                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   N)r;   r<   r4   r   r3   r   r   r`  encoderr   r   r@   r   	layernormDPTViTPoolerpooler	post_init)rQ   r4   rg  rU   s      r+   r<   zDPTModel.__init__  s    
 	  4V<DO.v6DO$V,f&8&8f>S>ST.?l6*T 	r*   c                 r    | j                   j                  r| j                  S | j                  j                  S r   )r4   r   r   r   )rQ   s    r+   get_input_embeddingszDPTModel.get_input_embeddings  s)    ;;  ??"??333r*   F)tie_last_hidden_statesrm   r   ro   c                 
   | j                  |      }|j                  } | j                  |fi |}|j                  }| j	                  |      }| j
                  | j                  |      nd }t        |||j                        S )N)r.   r/   r    )r   r   ri  r.   rj  rl  r-   r    )rQ   rm   r   embedding_outputembedding_last_hidden_statesencoder_outputssequence_outputpooled_outputs           r+   r   zDPTModel.forward  s     HLWcGd'7'J'J$+74<<8T+_X^+_);;..98<8OO4UYC-'%5%N%N
 	
r*   rL  )r!   r"   r#   r   r   r<   ro  r   r   r   r%   r&   r   r   r-   r   r   r   s   @r+   rf  rf    sl    y T *4  E2
''
 +,
 
>	
  3  
r*   rf  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )rk  r4   c                     t         |           t        j                  |j                  |j
                        | _        t        |j                     | _	        y r   )
r;   r<   r   r   r@   pooler_output_sizer   r	   
pooler_act
activationr   s     r+   r<   zDPTViTPooler.__init__&  s>    YYv1163L3LM
 !2!23r*   r0   ro   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r   r{  )rQ   r0   first_token_tensorrv  s       r+   r   zDPTViTPooler.forward+  s6     +1a40

#566r*   r   r   s   @r+   rk  rk  %  s*    4y 4
U\\ ell r*   rk  c            
            e Zd ZdZdef fdZ	 	 d
deej                     de	dz  de	dz  deej                     fd	Z
 xZS )DPTNecka;  
    DPTNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as
    input and produces another list of tensors as output. For DPT, it includes 2 stages:

    * DPTReassembleStage
    * DPTFeatureFusionStage.

    Args:
        config (dict): config dict.
    r4   c           
         t         |           || _        |j                  !|j                  j                  dk(  rd | _        nt        |      | _        t        j                         | _	        |j                  D ]?  }| j                  j                  t        j                  ||j                  ddd             A t        |      | _        y )Nswinv2r   r   Fr:   r$  r   )r;   r<   r4   r  
model_typereassemble_stager   r   r   convsr  r  rK   r9  r)  fusion_stage)rQ   r4   channelrU   s      r+   r<   zDPTNeck.__init__@  s     !!-&2H2H2S2SW_2_$(D!$6v$>D!]]_
// 	sGJJbii1J1JXYcdkpqr	s 2&9r*   Nr0   r  r  ro   c                    t        |t        t        f      st        d      t	        |      t	        | j
                  j                        k7  rt        d      | j                  | j                  |||      }t        |      D cg c]  \  }} | j                  |   |       }}}| j                  |      }|S c c}}w )z
        Args:
            hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
                List of hidden states from the backbone.
        z2hidden_states should be a tuple or list of tensorszOThe number of hidden states should be equal to the number of neck hidden sizes.)rA   r(   r  	TypeErrorrG   r4   r  rH   r  r  r  r  )rQ   r0   r  r  r  featurer|   r   s           r+   r   zDPTNeck.forwardQ  s     -%7PQQ}T[[%B%B!CCnoo   , 11-{[M=F}=UVzq'MDJJqM'*VV ""8, Ws   B:r  )r!   r"   r#   r$   r   r<   r  r%   r   r   r   r   r   s   @r+   r  r  4  sa    	:y :( $("&	ELL) Dj 4Z	
 
ell	r*   r  c                   f     e Zd ZdZdef fdZdeej                     dej                  fdZ	 xZ
S )DPTDepthEstimationHeada	  
    Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
    the predictions to the input resolution after the first convolutional layer (details can be found in the paper's
    supplementary material).
    r4   c                    t         |           || _        d | _        |j                  rt        j                  ddddd      | _        |j                  }t        j                  t        j                  ||dz  ddd      t        j                  ddd	
      t        j                  |dz  dddd      t        j                         t        j                  ddddd      t        j                               | _        y )N   )r   r   )r   r   r#  rY   r   r   rZ   TrJ      r   )r;   r<   r4   rL   add_projectionr   rK   r9  r  Upsampler7  headrQ   r4   r|   rU   s      r+   r<   zDPTDepthEstimationHead.__init__u  s       iiSfV]cdDO,,MMIIhA1QPQRKKQZtLIIh!mRQq!LGGIIIb!1a@GGI
	r*   r0   ro   c                     || j                   j                     }| j                  +| j                  |      } t        j                         |      }| j                  |      }|j                  d      }|S )Nr   r]   )r4   head_in_indexrL   r   r7  r  squeeze)rQ   r0   predicted_depths      r+   r   zDPTDepthEstimationHead.forward  sg    %dkk&?&?@??& OOM:M%BGGIm4M))M2)11a18r*   )r!   r"   r#   r$   r   r<   r  r%   r   r   r   r   s   @r+   r  r  n  s4    
y 
&T%,,%7 ELL r*   r  zu
    DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2.
    c                        e Zd Z fdZee	 ddej                  dej                  dz  de	e
   defd              Z xZS )	DPTForDepthEstimationc                    t         |   |       d | _        |j                  du r|j                  t        |      | _        nt        |d      | _        t        |      | _	        t        |      | _        | j                          y NF)rg  )r;   r<   rE   r   r  r
   rf  rO  r  neckr  r  rm  r   s     r+   r<   zDPTForDepthEstimation.__init__  sq     u$)?)?)K)&1DM%@DH FO	 +62	 	r*   Nrm   labelsr   ro   c                     d}|t        d      |j                  d      xs t         j                  dd      }d|d<    j                  *  j                  j
                  |fi |}|j                  }n  j                  |fi |}|j                  } j                  j                  s:t        |dd       D 	cg c]   \  }}	| j                  j                  v s|	" }}}	n4|j                  }
|
j                   fdt        |dd       D               |
}d\  }} j                  j                  S j                  j                  du r;|j                  \  }}}} j                  j                  j                   }||z  }||z  } j#                  |||      } j%                  |      }t'        |||r|j                  nd|j(                  	      S c c}	}w )
a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth depth estimation maps for computing the loss.

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DPTForDepthEstimation
        >>> import torch
        >>> import numpy as np
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large")
        >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # interpolate to original size
        >>> post_processed_output = image_processor.post_process_depth_estimation(
        ...     outputs,
        ...     target_sizes=[(image.height, image.width)],
        ... )

        >>> # visualize the prediction
        >>> predicted_depth = post_processed_output[0]["predicted_depth"]
        >>> depth = predicted_depth * 255 / predicted_depth.max()
        >>> depth = depth.detach().cpu().numpy()
        >>> depth = Image.fromarray(depth.astype("uint8"))
        ```NzTraining is not implemented yetr~   FTr   c              3   ^   K   | ]$  \  }}|j                   j                  d d v r| & ywrY   Nr4   backbone_out_indices.0idxr  rQ   s      r+   	<genexpr>z0DPTForDepthEstimation.forward.<locals>.<genexpr>  s6      .$Wdkk>>qrBB .s   *-r  )lossr  r0   r1   )NotImplementedErrorgetgetattrr4   rE   forward_with_filtered_kwargsrt   rO  r0   r   r  r  r    extendr  rs   r>   r  r  r   r1   )rQ   rm   r  r   r  user_requested_hidden_statesoutputsr0   r  r  backbone_hidden_statesr  r  r   ry   rz   r>   r  s   `                 r+   r   zDPTForDepthEstimation.forward  s   \ %&GHH (.zz2H'I (
WKK/N
$ *.%&==$@dmm@@XQWXG#00Mdhh|6v6G#11M ;;((09-:K0L! ,WPSW[WbWbWwWwPwG! ! *1)I)I&&-- .(1-2C(D. 
 !7$.!k;;&&2t{{7L7LPU7U"."4"4Aq&%44??J!Z/L:-K		-{K))M2#+3O'//UY))	
 	
-!s   < GGr   )r!   r"   r#   r<   r   r   r%   r&   
LongTensorr   r   r   r   r   r   s   @r+   r  r    sl    $  +/[
''[
   4'[
 +,	[

 
[
  [
r*   r  c                   b     e Zd Zdef fdZdeej                     dej                  fdZ xZ	S )DPTSemanticSegmentationHeadr4   c                    t         |           || _        |j                  }t	        j
                  t	        j                  ||ddd      t	        j                  |      t	        j                         t	        j                  |j                        t	        j                  ||j                  d      t	        j                  ddd	            | _        y )
Nr   r   Fr  r9   rY   rZ   TrJ  )r;   r<   r4   r9  r   r  rK   r=  r7  r   semantic_classifier_dropout
num_labelsr  r  r  s      r+   r<   z$DPTSemanticSegmentationHead.__init__  s    ,,MMIIhaONN8$GGIJJv99:IIh 1 1qAKKQZtL
	r*   r0   ro   c                 Z    || j                   j                     }| j                  |      }|S r   )r4   r  r  rQ   r0   logitss      r+   r   z#DPTSemanticSegmentationHead.forward  s)    %dkk&?&?@=)r*   )
r!   r"   r#   r   r<   r  r%   r   r   r   r   s   @r+   r  r    s/    
y 
T%,,%7 ELL r*   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )DPTAuxiliaryHeadr4   c                 X   t         |           |j                  }t        j                  t        j
                  ||ddd      t        j                  |      t        j                         t        j                  dd      t        j
                  ||j                  d            | _
        y )Nr   r   Fr  g?r9   )r;   r<   r9  r   r  rK   r=  r7  r   r  r  r  s      r+   r<   zDPTAuxiliaryHead.__init__%  sv    ,,MMIIhaONN8$GGIJJsE"IIh 1 1qA
	r*   r0   ro   c                 (    | j                  |      }|S r   )r  r  s      r+   r   zDPTAuxiliaryHead.forward1  s    =)r*   r   r   s   @r+   r  r  $  s*    

y 

U\\ ell r*   r  c                        e Zd Zdef fdZee	 	 d	dej                  dz  dej                  dz  de
e   defd              Z xZS )
DPTForSemanticSegmentationr4   c                     t         |   |       t        |d      | _        t	        |      | _        t        |      | _        |j                  rt        |      nd | _
        | j                          y r  )r;   r<   rf  rO  r  r  r  r  use_auxiliary_headr  auxiliary_headrm  r   s     r+   r<   z#DPTForSemanticSegmentation.__init__8  s^     Fe< FO	 07	:@:S:S.v6Y] 	r*   Nrm   r  r   ro   c                 @    |$ j                   j                  dk(  rt        d      |j                  d      xs t	         j                   dd      }d|d<     j
                  |fi |}|j                  } j                   j                  s:t        |dd       D cg c]   \  }}| j                   j                  v s|" }}}n4|j                  }	|	j                   fdt        |dd       D               |	} j                  |      } j                  |      }
d} j                   j                  |d	         }d}|t        j                   j#                  |
|j$                  d
d dd      }|0t        j                   j#                  ||j$                  d
d dd      }t'         j                   j(                        } |||      } ||      }| j                   j*                  |z  z   }t-        ||
|r|j                  nd|j.                        S c c}}w )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DPTForSemanticSegmentation
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large-ade")
        >>> model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oner~   FTc              3   `   K   | ]%  \  }}|j                   j                  d d v s"| ' ywr  r  r  s      r+   r  z5DPTForSemanticSegmentation.forward.<locals>.<genexpr>|  s7      *(CCSWS^S^SsSstutvSwLw*s   #..)r0   r7   r8   rZ   rI  )ignore_index)r  r  r0   r1   )r4   r  rH   r  r  rO  r0   r   r  r  r    r  r  r  r  r   ra   rb   rs   r   semantic_loss_ignore_indexauxiliary_loss_weightr   r1   )rQ   rm   r  r   r  r  r0   r  r  r  r  auxiliary_logitsr  upsampled_logitsupsampled_auxiliary_logitsloss_fct	main_lossauxiliary_losss   `                 r+   r   z"DPTForSemanticSegmentation.forwardG  s<   @ $++"8"8A"=NOO (.zz2H'I (
WKK/N
$ *.%&HPQ]HhagHh-- {{$$,5mAB6G,H(CCSWS^S^SsSsLsM  &-%E%E"")) *,5mAB6G,H*  3M			>=)*#22=3DE!}}88V\\"#.Zu  9    +-/]]-F-F$6<<+<:]b .G .* (T[[5[5[\H !16:I%&@&INt{{@@>QQD&3O'//UY))	
 	
Es    H>Hr  )r!   r"   r#   r   r<   r   r   r%   r&   r  r   r   r   r   r   r   s   @r+   r  r  6  s{    y   26*.U
''$.U
   4'U
 +,	U

 
!U
  U
r*   r  )r  r  rf  rN  )Nr   )Lr$   collections.abcrB   r   dataclassesr   r%   r   torch.nnr    r   rS  activationsr	   backbone_utilsr
   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_dptr   
get_loggerr!   loggerr   r-   Moduler3   r   r   r   floatr   r   r   r   r   r   r   r   r  r  r)  r2  r+  rN  r`  rf  rk  r  r  r  r  r  r  __all__r)   r*   r+   <module>r     s    $ !   % & ! + 9 ^ ^ F & X X I 5 ( 
		H	% 	J 	J 	J J; J J$]
RYY ]
@4Yryy 4YnBII L !%II%<<% 
% <<	%
 LL4'% T\% % '(%:4.ryy 4.pryy $bii "  
299 
, De eP" ,#BII #0:'RYY :'z"BII "J 4 4 40@BII @ 1
! 1
 1
j299 7bii 7t%RYY %P 
p
. p

p
f")) ,ryy $ g
!3 g
 g
T dr*   