
    iS                        d Z ddlmZ ddlZddlmZ ddlmZ ddlmZ	 ddl
mZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZmZmZmZmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$  ed      e G d de                    Z% G d ded      Z& G d de!      Z' G d de#      Z( G d d ejR                        Z* G d! d"e      Z+ G d# d$ejR                        Z, G d% d&ejR                        Z- G d' d(ejR                        Z. G d) d*ejR                        Z/e G d+ d,e             Z0 ed-.       G d/ d0e0             Z1g d1Z2y)2u9   CHMv2 model — Canopy Height Model v2, adapted from DPT.    )LiteralN)strict)nn   )initialization)%consolidate_backbone_kwargs_to_configload_backbone)PreTrainedConfig)DepthEstimatorOutput)PreTrainedModel)ImagesKwargsUnpack)
TensorTypeTransformersKwargsauto_docstringcan_return_tuplerequires_backends   )
AutoConfig) DepthAnythingPreActResidualLayer)DPTImageProcessor)DPTReassembleLayer_get_backbone_hidden_sizez%facebook/dinov3-vitl16-chmv2-dpt-head)
checkpointc                       e Zd ZU dZdZdeiZdZee	z  dz  e
d<   dZee
d<   dZee
d<   dZeeez     dz  e
d	<   dZee   dz  e
d
<   dZee
d<   dZee
d<   dZee
d<   dZee
d<   dZee
d<   dZee
d<   dZed   e
d<   dZed   e
d<    fdZ xZS )CHMv2Configa  
    backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*):
        The configuration of the backbone model. Only DINOv3ViTConfig is currently supported.
    patch_size (`int`, *optional*, defaults to 16):
        The patch size used by the backbone vision transformer.
    reassemble_factors (`list[float]`, *optional*, defaults to `[4, 2, 1, 0.5]`):
        The up/downsampling factors of the reassemble layers.
    post_process_channels (`list[int]`, *optional*, defaults to `[128, 256, 512, 1024]`):
        The output channel sizes of the reassemble stage for each backbone feature level.
    fusion_hidden_size (`int`, *optional*, defaults to 256):
        The number of channels before fusion.
    head_hidden_size (`int`, *optional*, defaults to 128):
        The number of channels in the hidden layer of the depth estimation head.
    number_output_channels (`int`, *optional*, defaults to 256):
        Number of output channels for the CHMv2 head (number of depth bins).
    readout_type (`str`, *optional*, defaults to `"project"`):
        Type of readout operation for the CLS token. One of `["ignore", "add", "project"]`.
    min_depth (`float`, *optional*, defaults to 0.001):
        The minimum depth value for depth bin calculation.
    max_depth (`float`, *optional*, defaults to 96.0):
        The maximum depth value for depth bin calculation.
    bins_strategy (`str`, *optional*, defaults to `"chmv2_mixlog"`):
        The strategy for depth bins distribution. One of `["linear", "log", "chmv2_mixlog"]`.
    norm_strategy (`str`, *optional*, defaults to `"chmv2_mixlog"`):
        The normalization strategy for depth prediction. One of `["linear", "softmax", "sigmoid", "chmv2_mixlog"]`.

    ```python
    >>> from transformers import CHMv2Config, CHMv2ForDepthEstimation

    >>> configuration = CHMv2Config()
    >>> model = CHMv2ForDepthEstimation(configuration)
    >>> configuration = model.config
    ```
    chmv2backbone_configN   
patch_sizeg{Gz?initializer_rangereassemble_factorspost_process_channels   fusion_hidden_size   head_hidden_sizenumber_output_channelsprojectreadout_typegMbP?	min_depthg      X@	max_depthchmv2_mixlog)linearlogr-   bins_strategy)r.   softmaxsigmoidr-   norm_strategyc                     | j                   	g d| _         | j                  	g d| _        ddddddd	g d
d	d	dd	d}t        d| j                  d|d|\  | _        }t	        |   di | y )N)   r      g      ?)r&   r$   i      i  r7   i   r      r5   T)         r8   gư>)
image_sizehidden_sizeintermediate_sizenum_attention_headsnum_hidden_layersnum_register_tokenskey_biasout_indicesreshape_hidden_statesapply_layernormlayer_norm_epsreturn_class_token
dinov3_vit)r   default_config_typedefault_config_kwargs )r"   r#   r   r   super__post_init__)selfkwargsrJ   	__class__s      x/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/chmv2/modular_chmv2.pyrM   zCHMv2Config.__post_init__\   s    ""*&4D#%%-)>D& !%#%!##$*%)#""&!
 (M (
 00 ,"7(
 	(
$f 	''    )__name__
__module____qualname____doc__
model_typer   sub_configsr   dictr
   __annotations__r    intr!   floatr"   listr#   r%   r'   r(   r*   strr+   r,   r0   r   r3   rM   __classcell__rP   s   @rQ   r   r   %   s    !F J$j1K6:OT,,t3:J#u#37US[)D07.249t+2!!c"%C%!L#!IuIu>LM7:;LM[M7IJ[( (rR   r   c                   :    e Zd ZU dZeed<   eed<   eed<   eed<   y)CHMv2ImageProcessorKwargsa=  
    ensure_multiple_of (`int`, *optional*, defaults to 1):
        If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
        by `ensure_multiple_of` in `preprocess`.
    keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
        If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
        be overridden by `keep_aspect_ratio` in `preprocess`.
    do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
        Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
        is used for background, and background itself is not included in all classes of a dataset (e.g.
        ADE20k). The background label will be replaced by 255.
    ensure_multiple_ofsize_divisorkeep_aspect_ratiodo_reduce_labelsN)rS   rT   rU   rV   r[   rZ   boolrK   rR   rQ   rb   rb   {   s!     rR   rb   F)totalc            
       ~    e Zd ZdZdZdZdZdZg dZg dZ	e
Z	 dddd	eeeeef      z  dz  dz  d
eeeef      fdZy)CHMv2ImageProcessorFTr   )gzG?gM?gl?)g$C?g+?gM?Noutputsr   target_sizesreturnc                 v   t        | d       |j                  }|"t        |      t        |      k7  rt        d      g }|dgt        |      z  n|}t	        ||      D ]X  \  }}|>t
        j                  j                  j                  |d   |dd      j                         }|j                  d|i       Z |S )	a  
        Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images.
        Only supports PyTorch.

        Args:
            outputs ([`DepthEstimatorOutput`]):
                Raw outputs of the model.
            target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*):
                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
                (height, width) of each image in the batch. If left to None, predictions will not be resized.

        Returns:
            `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth
            predictions.
        torchNz]Make sure that you pass in as many target sizes as the batch dimension of the predicted depth)NN.bilinearTsizemodealign_cornerspredicted_depth)r   ru   len
ValueErrorzipro   r   
functionalinterpolatesqueezeappend)rN   rk   rl   ru   resultsdepthtarget_sizes          rQ   post_process_depth_estimationz1CHMv2ImageProcessor.post_process_depth_estimation   s    ( 	$(!11$3+?3|CT+To  8D8LvO 44R^"%o|"D 	7E;&++77/*:]a 8 ')  NN-u56	7 rR   N)rS   rT   rU   	do_resizedo_padrd   rc   re   
image_mean	image_stdrb   valid_kwargsr   r]   tupler[   rY   r^   r   rK   rR   rQ   rj   rj      s~    IFL&J%I,L
 JN''' !4c3h#884?$F' 
d3
?#	$	'rR   rj   c                       e Zd Zy)CHMv2ReassembleLayerNrS   rT   rU   rK   rR   rQ   r   r          rR   r   c                   n     e Zd ZdZdef fdZddeej                     deej                     fdZ	 xZ
S )CHMv2ReassembleStagez
    Reassemble stage that processes hidden states from the backbone into image-like feature
    representations at various resolutions.
    configc           	         t         |           || _        |j                  | _        t	        j
                         | _        t        |j                  |j                        D ],  \  }}| j                  j                  t        |||             . t        |      }| j                  dk(  rt	        j
                         | _        t        t        | j                              D ]Z  }| j                  j                  t	        j                   t	        j"                  d|z  |      t	        j$                                      \ y y )N)r   channelsfactorr)   r   )rL   __init__r   r*   r   
ModuleListlayersrx   r#   r"   r|   r   r   readout_projectsrangerv   
SequentialLinearGELU)rN   r   out_channelsr   r=   _rP   s         rQ   r   zCHMv2ReassembleStage.__init__   s    "//mmo$'(D(DfF_F_$` 	 L&KK$!)!	 07	)$&MMOD!3t{{+, p%%,,R]]299Q_Va;bdfdkdkdm-nop *rR   hidden_statesrm   c                    g }t        |      D ]  \  }}t        |t        t        f      rt	        |      dk(  r|d   |d   }}|j
                  }| j                  dk(  r|j                  d      j                  dd      }|j                  d      j                  |      }	 | j                  |   t        j                  ||	fd            }|j                  ddd      j                  |      }n| j                  dk(  r|j                  d      |j                  d      z   }|j                  |      }nd|j!                         dk(  rQ|d d dd f   }|j
                  \  }
}}|j                  |
|||      }|j                  dddd      j#                         } | j$                  |   |      }|j'                  |        |S )Nr   r   r6   r)   addr   )	enumerate
isinstancer   r]   rv   shaper*   flatten	transpose	unsqueeze	expand_asr   ro   catpermutereshapedim
contiguousr   r|   )rN   r   patch_heightpatch_widthout	layer_idxhidden_state	cls_tokenfeature_shapereadout
batch_sizer   num_channelss                rQ   forwardzCHMv2ReassembleStage.forward   s   '0'? 	%#I|,63|;LPQ;Q*6q/<?i , 2 2$$	1#/#7#7#:#D#DQ#JL'11!4>>|LG#C4#8#8#CEII|]dNegiDj#kL#/#7#71a#@#H#H#WL&&%/#/#7#7#:Y=P=PQS=T#TL#/#7#7#FL##%*#/12#6L2>2D2D/J<#/#7#7
LR]_k#lL#/#7#71a#C#N#N#PL14;;y1,?LJJ|$+	%. 
rR   NN)rS   rT   rU   rV   r   r   r]   ro   Tensorr   r_   r`   s   @rQ   r   r      s?    
p{ p*T%,,%7 aefkfrfras rR   r   c                       e Zd Zy)CHMv2PreActResidualLayerNr   rK   rR   rQ   r   r      r   rR   r   c                   2     e Zd Zddedef fdZddZ xZS )CHMv2FeatureFusionLayerr   is_first_layerc                     t         |           || _        t        j                  |j
                  |j
                  dd      | _        |st        |      | _        t        |      | _	        y )Nr6   T)kernel_sizebias)
rL   r   r   r   Conv2dr%   
projectionr   residual_layer1residual_layer2)rN   r   r   rP   s      rQ   r   z CHMv2FeatureFusionLayer.__init__  sW    ,))F$=$=v?X?Xfgnrs#;F#CD 7?rR   c                    |o| j                   sc|j                  |j                  k7  r6|j                  \  }}}}t        j                  j	                  |||fdd      }|| j                  |      z   }| j                  |      }|ddind|i}t        j                  j                  |fi |ddd}| j                  |      }|S )	Nrp   Frq   scale_factorr   rr   T)rs   rt   )r   r   r   ry   rz   r   r   r   )rN   r   residualrr   r   heightwidthmodifiers           rQ   r   zCHMv2FeatureFusionLayer.forward  s    (;(;!!X^^3&2&8&8#1fe==44FE?SX 5  ($*>*>x*HHL++L9*.,NA&VTN}}00

 	
 |4rR   )Fr   )rS   rT   rU   r   rg   r   r   r_   r`   s   @rQ   r   r     s    	@{ 	@D 	@rR   r   c                   *     e Zd ZdZd fd	Zd Z xZS )CHMv2UpsampleConvHeadz
    Convolutional head with intermediate upsampling.

    Architecture: Conv3x3 -> 2x bilinear upsample -> Conv3x3 -> ReLU -> Conv1x1.
    c                 L   t         |           t        j                  t        j                  ||dz  ddd      t        j
                  ddd      t        j                  |dz  |ddd      t        j                         t        j                  ||ddd      g      | _        y )	Nr   r   r6   )r   stridepaddingrp   T)r   rs   rt   r   )rL   r   r   r   r   UpsampleReLUhead)rN   featuresr(   n_hidden_channelsrP   s       rQ   r   zCHMv2UpsampleConvHead.__init__/  s    MM		(HMqTUV4P		(a-):RS]^_			+-CQR[\fgh
	rR   c                 8    | j                   D ]
  } ||      } |S r   )r   )rN   r   layers      rQ   r   zCHMv2UpsampleConvHead.forward;  s%    YY 	1E!-0M	1rR   )r&   )rS   rT   rU   rV   r   r   r_   r`   s   @rQ   r   r   (  s    

rR   r   c                        e Zd ZdZdef fdZdeej                     de	de	dej                  fdZ
deej                     de	de	dej                  fd	Z xZS )
	CHMv2Headz
    CHMv2 dense-prediction head adapted from DPT.

    Integrates reassemble, projection convs, feature fusion, and UpConv depth head.
    r   c           
      J   t         |           || _        t        |      | _        t        j                         | _        |j                  D ]?  }| j                  j                  t        j                  ||j                  ddd             A t        j                         | _        t        t        |j                              D ]+  }| j                  j                  t        ||dk(               - t!        |j                  |j"                  |j$                        | _        y )Nr   r6   F)r   r   r   r   )r   )r   r(   r   )rL   r   r   r   reassemble_stager   r   convsr#   r|   r   r%   fusion_layersr   rv   r   r   r(   r'   
conv_depth)rN   r   channelidxrP   s       rQ   r   zCHMv2Head.__init__H  s     4V <]]_
33 	sGJJbii1J1JXYcdkpqr	s  ]]_V99:; 	bC%%&=fVY]^V^&`a	b 0..#)#@#@$55
rR   r   r   r   rm   c                 b   | j                  |||      }t        |      D cg c]  \  }} | j                  |   |       }}}|j                           | j                  d   |d         }t        dt        | j                              D ]  } | j                  |   |||         } |S c c}}w )Nr   r6   )r   r   r   reverser   r   rv   )rN   r   r   r   ifeaturer   fused_hidden_states           rQ   forward_featureszCHMv2Head.forward_features\  s    --m\;W=F}=UVzq'MDJJqM'*VV2T//28A;?q#d0012 	XA!6!3!3A!67I8TU;!W	X "! Ws   B+c                 N    | j                  |||      }| j                  |      }|S r   )r   r   )rN   r   r   r   r   s        rQ   r   zCHMv2Head.forwardh  s)    ##M<Mooc"
rR   )rS   rT   rU   rV   r   r   r]   ro   r   r[   r   r   r_   r`   s   @rQ   r   r   A  sy    
{ 
(
"d5<<.@ 
"PS 
"be 
"jojvjv 
"T%,,%7 s Y\ afamam rR   r   c                        e Zd ZdZdef fdZdedej                  dej                  fdZ
dej                  d	ej                  dej                  fd
Zdej                  dej                  fdZ xZS )CHMv2FeaturesToDepthzJConverts raw logits from the CHMv2 head into a depth map using depth bins.r   c                     t         |           |j                  | _        |j                  | _        |j                  | _        |j
                  | _        d| _        d| _        d| _        y )Ng-C6?g:0yE>g-q=)	rL   r   r+   r,   r0   r3   _mixlog_max_clamp_value_mixlog_eps_shift_mixlog_epsrN   r   rP   s     rQ   r   zCHMv2FeaturesToDepth.__init__q  s\    ))))#11#11'+$!% rR   n_binsdevicerm   c                    | j                   dz  }t        j                  | j                  |||      }t        j                  t        j                  t        j
                  t        j                  | j                  |            t        j
                  t        j                  ||            ||            }t        j                  dd||      }||z  d|z
  |z  z   }|S )z
        Creates mixed log bins interpolated between linear and log distributions.

        The max_depth is divided by 8.0 internally; this scaling is reversed in
        `_create_outputs_with_mixlog_norm` by multiplying by 8.0.
               @r         ?        )r,   ro   linspacer+   expr/   tensor)rN   r   r   scaled_max_depthr.   r/   interp_weightbinss           rQ   _create_mixlog_binsz(CHMv2FeaturesToDepth._create_mixlog_bins{  s      >>C/0@&QWXiiNN		%,,t~~fEF		%,,'7GH	
 sCGs"cM&9V%CCrR   inputr   c                 &   t        j                  |      }|j                  dd      }| j                  d      j	                  | j
                        | j                  z   }||z   }|j                  dd      }t        j                  |ddd      j                  | j                        }||z  }|j                  dddd      j                  | j                        }	||	z  j                  dd      j                  | j                        }
|
dz  }
|
S )	zEConverts depth bin logits to depth values using mixlog normalization.r6   Tr   keepdimr   r   )nanposinfneginfr   r   )ro   reluamin	clamp_min	clamp_maxr   r   sum
nan_to_numr   view)rN   r   r   logitsmin_per_sampleshift
logits_posdenomweightsbins_broadcastoutputs              rQ    _create_outputs_with_mixlog_normz5CHMv2FeaturesToDepth._create_outputs_with_mixlog_norm  s    E"D9 ++C0::4;W;WX[_[q[qqe^
1d3  CCHRRSWScScdu$1b!Q/99$:J:JKN*//At/DNNtO_O_`#rR   xc                 J   |j                   d   }|dkD  r| j                  dk(  r8t        j                  | j                  | j
                  ||j                        }n| j                  dk(  rt        j                  t        j                  t        j                  | j                              t        j                  t        j                  | j
                              ||j                        }t        j                  |      }n| j                  ||j                        }| j                  dv r| j                  dk(  r3t        j                  |      }d}||z   }||j                  dd      z  }nR| j                  d	k(  rt        j                  |d
      }n+t        j                  |      }||j                  dd      z  }t        j                   d||g      j#                  d
      }|S | j%                  ||      }|S t        j                  |      | j                  z   }|S )Nr6   r.   r   r/   )r.   r1   r2   g?Tr   r1   r   zikmn,k->imn)r   r0   ro   r   r+   r,   r   r/   r   r   r   r3   r  r  r1   r2   einsumr   r  )rN   r  r   r   logitepsr  s          rQ   r   zCHMv2FeaturesToDepth.forward  s   A:!!X-~~dnndnnfUVU]U]^##u,~~IIell4>>:;IIell4>>:;88	 yy//A!!%EE%%1!JJqMEC!CKE!EII!TI$BBE''94!MM!3E!MM!,E!EII!TI$BBEmeT]CMMRSMT 	 >>q$G  ZZ]T^^3FrR   )rS   rT   rU   rV   r   r   r[   ro   r   r   r   r  r   r_   r`   s   @rQ   r   r   n  sx    T!{ !# u||  *ell %,, [`[g[g &" "%,, "rR   r   c                   L     e Zd ZU eed<   dZdZdZdZdZ	dZ
dZdZd fdZ xZS )CHMv2PreTrainedModelr   r   pixel_values)imageTc                 \   t         |   |       t        |t        j                  t        j
                  t        j                  f      rct        j                  |j                  d| j                  j                         |j                   t        j                  |j                         y y y )Nr   )meanstd)rL   _init_weightsr   r   r   r   ConvTranspose2dinittrunc_normal_weightr   r!   r   zeros_)rN   modulerP   s     rQ   r   z"CHMv2PreTrainedModel._init_weights  st    f%fryy"))R5G5GHIv}}3DKK<Y<YZ{{&FKK( ' JrR   )rm   N)rS   rT   rU   r   rZ   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r_   r`   s   @rQ   r  r    sA    $O!&*#N"&) )rR   r  z
    CHMv2 Model with a depth estimation head on top (consisting of convolutional layers) e.g. for canopy height
    estimation.
    )custom_introc                        e Zd Zdef fdZd Zee	 d
dej                  dej                  dz  dee   defd	              Z xZS )CHMv2ForDepthEstimationr   c                     t         |   |       t        |      | _        t	        |      | _        t        |      | _        | j                          y r   )	rL   r   r	   backboner   r   r   features_to_depth	post_initr   s     rQ   r   z CHMv2ForDepthEstimation.__init__  s?     %f-f%	!5f!=rR   c                 6    | j                   j                         S r   )r3  get_input_embeddings)rN   s    rQ   r7  z,CHMv2ForDepthEstimation.get_input_embeddings  s    }}1133rR   Nr  labelsrO   rm   c                    d}|t        d      |j                  \  }}}}| j                  j                  }||z  }	||z  }
 | j                  |fi |}t        t        |j                  |j                              }| j                  ||	|
      }| j                  |      }|j                  d      }t        |||j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth depth estimation maps for computing the loss.
        NzTraining is not implemented yetr6   r  )lossru   r   
attentions)NotImplementedErrorr   r   r    r3  r]   rx   feature_maps
cls_tokensr   r4  r{   r   r   r;  )rN   r  r8  rO   r:  r   r   r   r    r   r   backbone_outputintermediate_featureshead_outputru   s                  rQ   r   zCHMv2ForDepthEstimation.forward  s     %&GHH*001fe[[++
+z)'$--?? $S)E)EGaGa%b cii 5|[Q00=)11a18#+)77&11	
 	
rR   r   )rS   rT   rU   r   r   r7  r   r   ro   FloatTensor
LongTensorr   r   r   r   r_   r`   s   @rQ   r1  r1    sr    { 4  +/ 
'' 
   4' 
 +,	 

 
 
   
rR   r1  )r   rj   r1  r  )3rV   typingr   ro   huggingface_hub.dataclassesr   r    r   r"  backbone_utilsr   r	   configuration_utilsr
   modeling_outputsr   modeling_utilsr   processing_utilsr   r   utilsr   r   r   r   r   autor   &depth_anything.modeling_depth_anythingr   dpt.image_processing_dptr   dpt.modeling_dptr   r   r   rb   rj   r   Moduler   r   r   r   r   r   r  r1  __all__rK   rR   rQ   <module>rS     sC   @   .  & R 3 4 - 4 h h  9 L BCQ(" Q(  DQ(hE (1+ 1h	- 	5299 5p	? 	"bii "JBII 2*		 *ZW299 Wt )? ) )& /
2 /
/
drR   