
    iR                     |   d dl Zd dl mZ d dlZd dlmZ ddlmZ ddlm	Z	 ddl
mZmZ ddlmZ dd	lmZmZmZ dd
lmZmZ ddlmZ ddlmZmZmZ ddlmZmZ ddlm Z  ddl!m"Z"  G d dejF                        Z$ G d dejF                        Z%	 	 d:dejF                  dejL                  dejL                  dejL                  dejL                  dz  de'dz  de'dee   fdZ( G d dejF                        Z) G d  d!ejF                        Z* G d" d#ejF                        Z+d;d$ejL                  d%e'd&e,d'ejL                  fd(Z- G d) d*ejF                        Z. G d+ d,ejF                        Z/ G d- d.e      Z0e G d/ d0e             Z1 G d1 d2e1      Z2e G d3 d4e1             Z3 ed56       G d7 d8ee1             Z4g d9Z5y)<    N)Callable)nn   )initialization)ACT2FN)BackboneMixinfilter_output_hidden_states)GradientCheckpointingLayer)BackboneOutputBaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstring
is_tracing)can_return_tuplemerge_with_config_defaults)capture_outputs   )PixioConfigc                   f     e Zd ZdZdef fdZddej                  dedej                  fdZ	 xZ
S )	PixioPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    configc                    t         |           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }|| _        || _        || _        || _
        t        j                  ||||      | _        y )Nr   r   )kernel_sizestride)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterablenum_patchesr   Conv2d
projection)selfr   r!   r"   r#   r$   r)   	__class__s          y/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/pixio/modeling_pixio.pyr    zPixioPatchEmbeddings.__init__/   s    !'!2!2F4E4EJ
$*$7$79K9Kk#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY$$(&))L+:^hi    pixel_valuesinterpolate_pos_encodingreturnc                    |j                   \  }}}}|| j                  k7  rt        d| j                   d| d      |sV|| j                  d   k7  s|| j                  d   k7  r2t        d| d| d| j                  d    d| j                  d    d		      | j	                  |      j                  d
      j                  dd
      }|S )NzoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r   r   zInput image size (*z) doesn't match model (z).   )shaper#   
ValueErrorr!   r+   flatten	transpose)r,   r0   r1   
batch_sizer#   heightwidth
embeddingss           r.   forwardzPixioPatchEmbeddings.forward>   s    2>2D2D/
L&%4,,,!../yaI  (++u8J/J (% 9+,Adooa.@-AE  __\2::1=GG1M
r/   )F)__name__
__module____qualname____doc__r   r    torchTensorboolr?   __classcell__r-   s   @r.   r   r   (   s;    j{ jELL D ]b]i]i r/   r   c                        e Zd ZdZdeddf fdZdej                  dededej                  fd	Z	d
ej                  dej                  fdZ
 xZS )PixioEmbeddingszB
    Construct the CLS tokens, position and patch embeddings.
    r   r2   Nc                 (   t         |           t        j                  t	        j
                  d|j                  |j                              | _        d | _	        t        |      | _        | j                  j                  }t        j                  t	        j
                  d||j                  z   |j                              | _        t        j                  |j                        | _        |j                  | _        |j"                  | _        || _        y )Nr   )r   r    r   	ParameterrD   randnn_cls_tokensr$   	cls_token
mask_tokenr   patch_embeddingsr)   position_embeddingsDropouthidden_dropout_probdropoutr"   r   )r,   r   r)   r-   s      r.   r    zPixioEmbeddings.__init__T   s    ekk!V5H5H&J\J\&]^ 4V <++77#%<<A{VM`M`?`bhbtbt0u#v zz&"<"<="// ++r/   r>   r<   r=   c                 @   |j                   d   | j                  z
  }| j                  j                   d   | j                  z
  }t               s||k(  r||k(  r| j                  S | j                  ddd| j                  f   }| j                  dd| j                  df   }|j                   d   }|| j                  z  }	|| j                  z  }
t        |dz        }|j                  d|||      }|j                  dddd      }|j                  }t        j                  j                  |j                  t        j                        |	|
fdd	
      j                  |      }|j                  dddd      j                  dd|      }t        j                   ||fd      S )a#  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support tracing and interpolation at torch.float32 precision.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Ng      ?r   r   r6   bicubicF)sizemodealign_cornersdtypedim)r7   rN   rR   r   r"   intreshapepermuter]   r   
functionalinterpolatetorD   float32viewcat)r,   r>   r<   r=   r)   num_positionsclass_pos_embedpatch_pos_embedr_   
new_height	new_widthsqrt_num_positionstarget_dtypes                r.   r1   z(PixioEmbeddings.interpolate_pos_encodinga   s    !&&q)D,=,==0066q9D<M<MM|} <5+++2216I8I8I6I3IJ221d6G6G6I3IJr"t.
T__,	 !34)11!5GI[]`a)11!Q1=&,,--33u}}-i(	 4 

 "<"
  	 *11!Q1=BB1b#Nyy/?;CCr/   r0   c                 x   |j                   \  }}}}| j                  j                  j                  j                  }| j                  |j                  |            }| j                  j                  |dd      }t        j                  ||fd      }|| j                  |||      z   }| j                  |      }|S )Nr\   rW   r   r^   )r7   rQ   r+   weightr]   re   rO   expandrD   rh   r1   rU   )	r,   r0   r;   _r<   r=   ro   r>   
cls_tokenss	            r.   r?   zPixioEmbeddings.forward   s    '3'9'9$
Avu,,77>>DD**<???+NO
^^**:r2>
YY
J7Q?
$"?"?
FTY"ZZ
\\*-
r/   )r@   rA   rB   rC   r   r    rD   rE   r`   r1   r?   rG   rH   s   @r.   rJ   rJ   O   si    { t $D5<< $D $DUX $D]b]i]i $DLELL U\\ r/   rJ   modulequerykeyvalueattention_maskscalingrU   kwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|||z   }t        j
                  j                  |d      }t        j
                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )NrW         r6   r   r^   )ptrainingr   )
rY   rD   matmulr:   r   rc   softmaxrU   r   
contiguous)
ru   rv   rw   rx   ry   rz   rU   r{   attn_weightsattn_outputs
             r.   eager_attention_forwardr      s     **R.D( <<s}}Q':;gEL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r/   c                        e Zd Zdef fdZdej                  dee   de	ej                  ej                  f   fdZ
 xZS )PixioSelfAttentionr   c                 2   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        |j                  | _        | j                  dz  | _        d| _        t        j                  |j                  | j                  |j                         | _        t        j                  |j                  | j                  |j                         | _        t        j                  |j                  | j                  |j                         | _        y )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads r4   r}   Fbias)r   r    r$   num_attention_headshasattrr8   r   r`   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probrz   	is_causalr   Linearqkv_biasrv   rw   rx   r,   r   r-   s     r.   r    zPixioSelfAttention.__init__   sF    : ::a?PVXhHi"6#5#5"6 7334A7 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PP"??//5YYv1143E3EFOO\
99V//1C1C&//ZYYv1143E3EFOO\
r/   hidden_statesr{   r2   c                    |j                   d   }|d| j                  | j                  f} | j                  |      j                  | j                  dd      } | j                  |      j                  | j                  dd      } | j                  |      j                  | j                  dd      }t        j                  | j                  j                  t              } || |||d f| j                  | j                  | j                  sdn| j                   d|\  }	}
|	j#                         d d | j$                  fz   }|	j'                  |      }	|	|
fS )Nr   rW   r   r6           )r   rz   rU   )r7   r   r   rw   rg   r:   rx   rv   r   get_interfacer   _attn_implementationr   r   rz   r   r   rY   r   ra   )r,   r   r{   r;   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shapes               r.   r?   zPixioSelfAttention.forward   sY   
 #((+
D$<$<d>V>VV	0DHH]+00)<FFq!L	4djj/44i@JJ1aP4djj/44i@JJ1aP(?(M(MKK,,.E)
 *=
*
 nnLL#}}C$2C2C
*
 
*
& #0"4"4"6s";t?Q?Q>S"S%--.EFo--r/   )r@   rA   rB   r   r    rD   rE   r   r   tupler?   rG   rH   s   @r.   r   r      sN    ]{ ](.||. +,. 
u||U\\)	*	.r/   r   c                   x     e Zd ZdZdef fdZdej                  dej                  dej                  fdZ xZ	S )PixioSelfOutputz
    The residual connection is defined in PixioLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r   c                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        y N)	r   r    r   r   r$   denserS   rT   rU   r   s     r.   r    zPixioSelfOutput.__init__   sB    YYv1163E3EF
zz&"<"<=r/   r   input_tensorr2   c                 J    | j                  |      }| j                  |      }|S r   )r   rU   )r,   r   r   s      r.   r?   zPixioSelfOutput.forward   s$    

=1]3r/   )
r@   rA   rB   rC   r   r    rD   rE   r?   rG   rH   s   @r.   r   r      s=    
>{ >
U\\  RWR^R^ r/   r   c                   f     e Zd Zdef fdZdej                  dee   dej                  fdZ	 xZ
S )PixioAttentionr   c                 b    t         |           t        |      | _        t	        |      | _        y r   )r   r    r   	attentionr   outputr   s     r.   r    zPixioAttention.__init__   s&    +F3%f-r/   r   r{   r2   c                 V     | j                   |fi |\  }}| j                  ||      }|S r   )r   r   )r,   r   r{   self_attn_outputrs   r   s         r.   r?   zPixioAttention.forward   s5    
 -dnn]EfE!-}=r/   r@   rA   rB   r   r    rD   rE   r   r   r?   rG   rH   s   @r.   r   r      s>    .{ .
|| +, 
	r/   r   input	drop_probr   r2   c                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    r   r   r   )r   )r]   device)r7   ndimrD   randr]   r   floor_div)r   r   r   	keep_probr7   random_tensorr   s          r.   	drop_pathr   
  s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FMr/   c                   x     e Zd ZdZd	dedz  ddf fdZdej                  dej                  fdZde	fdZ
 xZS )
PixioDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r2   c                 0    t         |           || _        y r   )r   r    r   )r,   r   r-   s     r.   r    zPixioDropPath.__init__  s    "r/   r   c                 D    t        || j                  | j                        S r   )r   r   r   )r,   r   s     r.   r?   zPixioDropPath.forward   s    FFr/   c                      d| j                    S )Nzp=)r   r,   s    r.   
extra_reprzPixioDropPath.extra_repr#  s    DNN#$$r/   r   )r@   rA   rB   rC   floatr    rD   rE   r?   strr   rG   rH   s   @r.   r   r     sG    b#%$, #$ #GU\\ Gell G%C %r/   r   c                   X     e Zd Zd fdZdej
                  dej
                  fdZ xZS )PixioMLPr2   c                 ~   t         |           |j                  x}}t        |j                  |j                  z        }t        j                  ||d      | _        t        |j                  t              rt        |j                     | _        n|j                  | _        t        j                  ||d      | _        y )NTr   )r   r    r$   r`   	mlp_ratior   r   fc1r%   
hidden_actr   r   
activationfc2)r,   r   in_featuresout_featureshidden_featuresr-   s        r.   r    zPixioMLP.__init__(  s    %+%7%77lf0063C3CCD99[/Ef''-$V%6%67DO$//DO99_lFr/   hidden_statec                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   )r,   r   s     r.   r?   zPixioMLP.forward3  s2    xx-|4xx-r/   )r2   N)r@   rA   rB   r    rD   rE   r?   rG   rH   s   @r.   r   r   '  s$    	GELL U\\ r/   r   c                   j     e Zd Zdeddf fdZdej                  dee   dej                  fdZ	 xZ
S )
PixioLayerr   r2   Nc                    t         |           t        j                  |j                  |j
                        | _        t        |      | _        |j                  dkD  rt        |j                        nt        j                         | _        t        j                  |j                  |j
                        | _        t        |      | _        y )Nepsr   )r   r    r   	LayerNormr$   layer_norm_epsnorm1r   r   drop_path_rater   Identityr   norm2r   mlpr   s     r.   r    zPixioLayer.__init__;  s    \\&"4"4&:O:OP
'/AGAVAVY\A\v'<'<=bdbmbmbo\\&"4"4&:O:OP
F#r/   r   r{   c                     | j                  |      } | j                  |fi |}| j                  |      |z   }| j                  |      }| j	                  |      }| j                  |      |z   }|S r   )r   r   r   r   r   )r,   r   r{   hidden_states_normself_attention_outputlayer_outputs         r.   r?   zPixioLayer.forwardE  sq    !ZZ6 ./A LV L'<=Mzz-0xx-~~l3mCr/   r   rH   s   @r.   r   r   :  sA    ${ $t $U\\ VDV=W \a\h\h r/   r   c                       e Zd ZU eed<   dZdZdZdZddgZ	dZ
dZdZdZeedZ ej$                         d	ej(                  ej*                  z  ej,                  z  fd
       Zy)PixioPreTrainedModelr   pixior0   )imageTrJ   r   )r   
attentionsru   c                 $   t        |t        j                  t        j                  z        rct	        j
                  |j                  d| j                  j                         |j                   t	        j                  |j                         yyt        |t        j                        r?t	        j                  |j                         t	        j                  |j                         yt        |t              rt	        j
                  |j                  d| j                  j                         t	        j
                  |j                  d| j                  j                         |j                    t	        j                  |j                          yyy)zInitialize the weightsr   )meanstdN)r%   r   r   r*   inittrunc_normal_rq   r   initializer_ranger   zeros_r   ones_rJ   rR   rO   rP   )r,   ru   s     r.   _init_weightsz"PixioPreTrainedModel._init_weightsd  s     fbii"))34v}}3DKK<Y<YZ{{&FKK( '-KK$JJv}}%0v99IfIfgv//ct{{?\?\]  ,F--. - 1r/   N)r@   rA   rB   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsrD   no_gradr   r   r*   r   r    r/   r.   r   r   S  s    $O!&*#*L9N"&#(
 U]]_/BII		$9BLL$H / /r/   r   c                   t     e Zd Zdef fdZe ed      dej                  de	e
   defd              Z xZS )	PixioEncoderr   c                     t         |   |       t        j                  t	        |j
                        D cg c]  }t        |       c}      | _        d| _        | j                          y c c}w )NF)
r   r    r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing	post_initr,   r   rs   r-   s      r.   r    zPixioEncoder.__init__v  sS     ]]fF^F^@_#`1Jv$6#`a
&+# $as   A-F)tie_last_hidden_statesr   r{   r2   c                 N    | j                   D ]  } ||fi |} t        |      S )N)last_hidden_state)r  r   )r,   r   r{   layer_modules       r.   r?   zPixioEncoder.forward|  s5     !JJ 	BL(A&AM	B ??r/   )r@   rA   rB   r   r    r   r   rD   rE   r   r   r   r?   rG   rH   s   @r.   r  r  u  sU    {   E2@U\\ @VDV=W @\k @ 3  @r/   r  c            	       |     e Zd Zdef fdZdefdZee	 d	de	j                  dz  dee   defd              Z xZS )

PixioModelr   c                     t         |   |       || _        t        |      | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y )Nr   )r   r    r   rJ   r>   r  encoderr   r   r$   r   	layernormr  r   s     r.   r    zPixioModel.__init__  sW     )&1#F+f&8&8f>S>STr/   r2   c                 .    | j                   j                  S r   r>   rQ   r   s    r.   get_input_embeddingszPixioModel.get_input_embeddings      ///r/   Nr0   r{   c                 H   |t        d      | j                  |      } | j                  |fi |}|j                  }| j	                  |      }|d d d | j                  j
                  d d f   j                  d      }t        |||j                  |j                        S )Nz You have to specify pixel_valuesr   r^   )r  pooler_outputr   r   )
r8   r>   r  r  r  rN   r   r   r   r   )r,   r0   r{   embedding_outputencoder_outputssequence_outputpooled_outputs          r.   r?   zPixioModel.forward  s     ?@@??<8+74<<8H+SF+S);;..9'+IT__-I-I+I1(LMRRWXRY)-')77&11	
 	
r/   r   )r@   rA   rB   r   r    r   r  r   r   rD   rE   r   r   r   r?   rG   rH   s   @r.   r  r    sh    	{ 	0&: 0  -1
llT)
 +,
 
$	
  
r/   r  zN
    Pixio backbone, to be used with frameworks like DETR and MaskFormer.
    )custom_introc            	       v     e Zd Z fdZdefdZeeede	j                  dee   defd                     Z xZS )PixioBackbonec                 X   t         |   |       t        |j                  dz         D cg c]  }|j                   c}| _        t        |      | _        t        |      | _	        t        j                  |j                  |j                        | _        | j                          y c c}w )Nr   r   )r   r    r  r  r$   num_featuresrJ   r>   r  r  r   r   r   r  r  r	  s      r.   r    zPixioBackbone.__init__  s     9>v?W?WZ[?[9\]AV//])&1#F+f&8&8f>S>ST 	 ^s   B'r2   c                 .    | j                   j                  S r   r  r   s    r.   r  z"PixioBackbone.get_input_embeddings  r  r/   r0   r{   c                    d|d<   | j                  |      } | j                  |fi |}|j                  }g }t        | j                  |      D ]  \  }}|| j
                  v s| j                  j                  r| j                  |      }| j                  j                  r|dd| j                   j                  df   }|j                  \  }	}
}}| j                  j                  }|j                  |	||z  ||z  d      }|j                  dddd      j                         }|j!                  |        t#        t%        |      ||j&                  	      S )
aw  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> processor = AutoImageProcessor.from_pretrained("facebook/pixio-huge")
        >>> model = AutoBackbone.from_pretrained(
        ...     "facebook/pixio-huge", out_features=["stage7", "stage15", "stage23", "stage31"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 1280, 16, 16]
        ```Toutput_hidden_statesNrW   r   r   r   r6   )feature_mapsr   r   )r>   r  r   zipstage_namesr   r   apply_layernormr  reshape_hidden_statesrN   r7   r"   ra   rb   r   appendr   r   r   )r,   r0   r{   r  r   r   r%  stager   r;   rs   r<   r=   r"   s                 r.   r?   zPixioBackbone.forward  sM   < *.%&??<8".$,,/?"J6"J,,#&t'7'7#G 
	2E<)));;..#'>>,#?L;;44#/4??3O3O3Q0Q#RL3?3E3E0J65!%!7!7J#/#7#7
FjDXZ_cmZmoq#rL#/#7#71a#C#N#N#PL##L1
	2 |,'((
 	
r/   )r@   rA   rB   r    r   r  r   r	   r   rD   rE   r   r   r   r?   rG   rH   s   @r.   r  r    sY    
0&: 0  2
ELL 2
FCU<V 2
[i 2
  ! 2
r/   r  )r  r   r  )Nr   )r   F)6collections.abcr&   r   rD   r    r   r   activationsr   backbone_utilsr   r	   modeling_layersr
   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   utils.output_capturingr   configuration_pixior   Moduler   rJ   rE   r   r   r   r   r   rF   r   r   r   r   r   r  r  r  __all__r   r/   r.   <module>r:     s  *  $   & ! H 9 [ [ F & C C I 5 ,$299 $NDbii DZ !%II%<<% 
% <<	%
 LL4'% T\% % '(%84. 4.nbii "RYY  U\\ e T V[VbVb %BII %ryy &+ 2 /? / /B@' @  %
% %
 %
P 
E
M#7 E

E
P Br/   