
    i              	          d Z ddlZddlZddlmZ ddlZddlmZmZ ddl	m
Z ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZ ddlmZ  ej@                  e!      Z"e ed       G d de                    Z#e ed       G d de                    Z$e ed       G d de                    Z%e ed       G d de                    Z&d Z'd Z(dJdej                  d e)d!e*d"ej                  fd#Z+ G d$ d%ejX                        Z- G d& d'ejX                        Z. G d( d)ejX                        Z/ G d* d+ejX                        Z0 G d, d-ejX                        Z1 G d. d/ejX                        Z2 G d0 d1ejX                        Z3 G d2 d3ejX                        Z4 G d4 d5ejX                        Z5 G d6 d7ejX                        Z6 G d8 d9e      Z7 G d: d;ejX                        Z8e G d< d=e             Z9e G d> d?e9             Z: ed@       G dA dBe9             Z; edC       G dD dEe9             Z< edF       G dG dHee9             Z=g dIZ>y)Kz!PyTorch Swinv2 Transformer model.    N)	dataclass)Tensornn   )initialization)ACT2FN)BackboneMixinfilter_output_hidden_states)GradientCheckpointingLayer)BackboneOutput)PreTrainedModel)ModelOutputauto_docstringlogging	torch_int)can_return_tuple   )Swinv2ConfigzP
    Swinv2 encoder's outputs, with potential hidden states and attentions.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZe	ej                  df   dz  ed<   dZ
e	ej                  df   dz  ed<   dZe	ej                  df   dz  ed<   y)Swinv2EncoderOutputa  
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlast_hidden_state.hidden_states
attentionsreshaped_hidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   tupler   r        {/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/swinv2/modeling_swinv2.pyr   r   (   s}     37u((4/6:>M5**C/047>7;Je'',-4;CGE%"3"3S"89D@Gr%   r   zX
    Swinv2 model's outputs that also contains a pooling of the last hidden states.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)	Swinv2ModelOutputa  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
        Average pooling of the last layer hidden-state.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nr   pooler_output.r   r   r   )r   r   r   r   r   r    r!   r"   r)   r   r#   r   r   r$   r%   r&   r(   r(   ?   s    	 37u((4/6.2M5$$t+2:>M5**C/047>7;Je'',-4;CGE%"3"3S"89D@Gr%   r(   z,
    Swinv2 masked image model outputs.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)	Swinv2MaskedImageModelingOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
        Masked image modeling (MLM) loss.
    reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Reconstructed pixel values.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlossreconstruction.r   r   r   )r   r   r   r   r,   r    r!   r"   r-   r   r#   r   r   r$   r%   r&   r+   r+   Y   s     &*D%

d
")/3NE%%,3:>M5**C/047>7;Je'',-4;CGE%"3"3S"89D@Gr%   r+   z2
    Swinv2 outputs for image classification.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)	Swinv2ImageClassifierOutputa7  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nr,   logits.r   r   r   )r   r   r   r   r,   r    r!   r"   r0   r   r#   r   r   r$   r%   r&   r/   r/   u   s     &*D%

d
")'+FE$+:>M5**C/047>7;Je'',-4;CGE%"3"3S"89D@Gr%   r/   c                     | j                   \  }}}}| j                  |||z  |||z  ||      } | j                  dddddd      j                         j                  d|||      }|S )z2
    Partitions the given input into windows.
    r   r   r            shapeviewpermute
contiguous)input_featurewindow_size
batch_sizeheightwidthnum_channelswindowss          r&   window_partitionrB      s}     /<.A.A+J|!&&Fk);8Lk[gM ##Aq!Q15@@BGGKYdfrsGNr%   c                     | j                   d   }| j                  d||z  ||z  |||      } | j                  dddddd      j                         j                  d|||      } | S )z?
    Merges windows to produce higher resolution features.
    r5   r   r   r   r2   r3   r4   r6   )rA   r<   r>   r?   r@   s        r&   window_reverserD      sn     ==$Lll2v4e{6JKYdfrsGooaAq!Q/::<AA"feUabGNr%   input	drop_probtrainingreturnc                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

            r   r   )r   )dtypedevice)r7   ndimr    randrK   rL   floor_div)rE   rF   rG   	keep_probr7   random_tensoroutputs          r&   	drop_pathrT      s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FMr%   c                   x     e Zd ZdZd	dedz  ddf fdZdej                  dej                  fdZde	fdZ
 xZS )
Swinv2DropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).NrF   rH   c                 0    t         |           || _        y N)super__init__rF   )selfrF   	__class__s     r&   rZ   zSwinv2DropPath.__init__   s    "r%   r   c                 D    t        || j                  | j                        S rX   )rT   rF   rG   r[   r   s     r&   forwardzSwinv2DropPath.forward   s    FFr%   c                      d| j                    S )Nzp=)rF   r[   s    r&   
extra_reprzSwinv2DropPath.extra_repr   s    DNN#$$r%   rX   )r   r   r   r   floatrZ   r    r   r_   strrb   __classcell__r\   s   @r&   rV   rV      sG    b#%$, #$ #GU\\ Gell G%C %r%   rV   c            
            e Zd ZdZd fd	Zdej                  dededej                  fdZ	 	 dd	ej                  dz  d
ej                  dz  dedeej                     fdZ xZS )Swinv2EmbeddingszW
    Construct the patch and position embeddings. Optionally, also the mask token.
    c                 ~   t         |           t        |      | _        | j                  j                  }| j                  j
                  | _        |r4t        j                  t        j                  dd|j                              nd | _        |j                  r=t        j                  t        j                  d|dz   |j                              | _        nd | _        t        j                  |j                        | _        t        j"                  |j$                        | _        |j(                  | _        || _        y )Nr   )rY   rZ   Swinv2PatchEmbeddingspatch_embeddingsnum_patches	grid_size
patch_gridr   	Parameterr    zeros	embed_dim
mask_tokenuse_absolute_embeddingsposition_embeddings	LayerNormnormDropouthidden_dropout_probdropout
patch_sizeconfig)r[   r{   use_mask_tokenrl   r\   s       r&   rZ   zSwinv2Embeddings.__init__   s     5f =++77//99O]",,u{{1a9I9I'JKcg))')||EKK;QR?TZTdTd4e'fD$'+D$LL!1!12	zz&"<"<= ++r%   
embeddingsr>   r?   rH   c                    |j                   d   dz
  }| j                  j                   d   dz
  }t        j                  j	                         s||k(  r||k(  r| j                  S | j                  ddddf   }| j                  ddddf   }|j                   d   }|| j
                  z  }	|| j
                  z  }
t        |dz        }|j                  d|||      }|j                  dddd      }t        j                  j                  ||	|
fdd	
      }|j                  dddd      j                  dd|      }t        j                  ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr5         ?r   r   r2   bicubicF)sizemodealign_cornersdim)r7   rt   r    jit
is_tracingrz   r   reshaper9   r   
functionalinterpolater8   cat)r[   r}   r>   r?   rl   num_positionsclass_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss               r&   interpolate_pos_encodingz)Swinv2Embeddings.interpolate_pos_encoding   s`    !&&q)A-0066q9A= yy##%+*F6UZ?+++221bqb59221ab59r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr%   Npixel_valuesbool_masked_posr   c                    |j                   \  }}}}| j                  |      \  }}	| j                  |      }|j                         \  }
}}|K| j                  j                  |
|d      }|j                  d      j                  |      }|d|z
  z  ||z  z   }| j                  (|r|| j                  |||      z   }n|| j                  z   }| j                  |      }||	fS )Nr5         ?)r7   rk   rv   r   rr   expand	unsqueezetype_asrt   r   ry   )r[   r   r   r   _r@   r>   r?   r}   output_dimensionsr=   seq_lenmask_tokensmasks                 r&   r_   zSwinv2Embeddings.forward	  s     *6););&<(,(=(=l(K%
%YYz*
!+!2
GQ&//00WbIK",,R088ED#sTz2[45GGJ##/''$*G*G
TZ\a*bb
'$*B*BB
\\*-
,,,r%   FNF)r   r   r   r   rZ   r    r   intr   r!   
BoolTensorboolr#   r_   re   rf   s   @r&   rh   rh      s    &&D5<< &D &DUX &D]b]i]i &DV 48).	-''$.- ))D0- #'	-
 
u||	-r%   rh   c                   v     e Zd ZdZ fdZd Zdej                  dz  deej                  ee
   f   fdZ xZS )rj   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t         |           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }|| _        || _        || _        || _
        |d   |d   z  |d   |d   z  f| _        t        j                  ||||      | _        y )Nr   r   )kernel_sizestride)rY   rZ   
image_sizerz   r@   rq   
isinstancecollectionsabcIterablerl   rm   r   Conv2d
projection)r[   r{   r   rz   r@   hidden_sizerl   r\   s          r&   rZ   zSwinv2PatchEmbeddings.__init__-  s    !'!2!2F4E4EJ
$*$7$79I9Ik#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY$$(&$Q-:a=8*Q-:VW=:XY))L+:^hir%   c                 n   || j                   d   z  dk7  rDd| j                   d   || j                   d   z  z
  f}t        j                  j                  ||      }|| j                   d   z  dk7  rFddd| j                   d   || j                   d   z  z
  f}t        j                  j                  ||      }|S )Nr   r   )rz   r   r   pad)r[   r   r>   r?   
pad_valuess        r&   	maybe_padzSwinv2PatchEmbeddings.maybe_pad<  s    4??1%%*T__Q/%$//!:L2LLMJ==,,\:FLDOOA&&!+Q4??1#5QRAS8S#STJ==,,\:FLr%   r   NrH   c                     |j                   \  }}}}| j                  |||      }| j                  |      }|j                   \  }}}}||f}|j                  d      j	                  dd      }||fS )Nr2   r   )r7   r   r   flatten	transpose)r[   r   r   r@   r>   r?   r}   r   s           r&   r_   zSwinv2PatchEmbeddings.forwardE  s}    )5););&<~~lFEB__\2
(..1fe#UO''*44Q:
,,,r%   )r   r   r   r   rZ   r   r    r!   r#   r   r   r_   re   rf   s   @r&   rj   rj   &  sF    j	-E$5$5$< 	-u||UZ[^U_G_A` 	-r%   rj   c            	            e Zd ZdZej
                  fdee   dedej                  ddf fdZ	d Z
d	ej                  d
eeef   dej                  fdZ xZS )Swinv2PatchMerginga'  
    Patch Merging Layer.

    Args:
        input_resolution (`tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    input_resolutionr   
norm_layerrH   Nc                     t         |           || _        || _        t	        j
                  d|z  d|z  d      | _         |d|z        | _        y )Nr3   r2   Fbias)rY   rZ   r   r   r   Linear	reductionrv   )r[   r   r   r   r\   s       r&   rZ   zSwinv2PatchMerging.__init__^  sI     01s7AG%@q3w'	r%   c                     |dz  dk(  xs |dz  dk(  }|r.ddd|dz  d|dz  f}t         j                  j                  ||      }|S )Nr2   r   r   )r   r   r   )r[   r;   r>   r?   
should_padr   s         r&   r   zSwinv2PatchMerging.maybe_pade  sU    qjAo:519>
Q519a!<JMM--mZHMr%   r;   input_dimensionsc                    |\  }}|j                   \  }}}|j                  ||||      }| j                  |||      }|d d dd ddd dd d f   }|d d dd ddd dd d f   }	|d d dd ddd dd d f   }
|d d dd ddd dd d f   }t        j                  ||	|
|gd      }|j                  |dd|z        }| j                  |      }| j                  |      }|S )Nr   r2   r   r5   r3   )r7   r8   r   r    r   r   rv   )r[   r;   r   r>   r?   r=   r   r@   input_feature_0input_feature_1input_feature_2input_feature_3s               r&   r_   zSwinv2PatchMerging.forwardm  s   ((5(;(;%
C%**:vulS}feD'14a4Aq(89'14a4Aq(89'14a4Aq(89'14a4Aq(89		?O_Ve"fhjk%**:r1|;KL}5		-0r%   )r   r   r   r   r   ru   r#   r   ModulerZ   r   r    r   r_   re   rf   s   @r&   r   r   Q  sr    
 XZWcWc (s (# (299 (hl (U\\ U3PS8_ Y^YeYe r%   r   c            
            e Zd Zddgf fd	Z	 	 d
dej
                  dej                  dz  dedz  deej
                     fdZ	d	 Z
 xZS )Swinv2SelfAttentionr   c           
         t         |           ||z  dk7  rt        d| d| d      || _        t	        ||z        | _        | j                  | j
                  z  | _        t        |t        j                  j                        r|n||f| _        || _        t        j                  t        j                   dt        j"                  |ddf      z              | _        t        j&                  t        j(                  ddd	
      t        j*                  d	      t        j(                  d|d
            | _        | j/                         \  }}| j1                  d|d       | j1                  d|d       t        j(                  | j                  | j                  |j2                  
      | _        t        j(                  | j                  | j                  d
      | _        t        j(                  | j                  | j                  |j2                  
      | _        t        j:                  |j<                        | _        y )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()
   r   r2   i   Tr   )inplaceFrelative_coords_table)
persistentrelative_position_index) rY   rZ   
ValueErrornum_attention_headsr   attention_head_sizeall_head_sizer   r   r   r   r<   pretrained_window_sizer   ro   r    logoneslogit_scale
Sequentialr   ReLUcontinuous_position_bias_mlpcreate_coords_table_and_indexregister_bufferqkv_biasquerykeyvaluerw   attention_probs_dropout_probry   )	r[   r{   r   	num_headsr<   r   r   r   r\   s	           r&   rZ   zSwinv2SelfAttention.__init__  s   ?a#C5(^_h^iijk  $- #&sY#7 !558P8PP%k;??3K3KLKS^`kRl 	 '=#<<		"uzz9aQRBS7T2T(UV,.MMIIa4("''$*?3PY`eAf-
) :>9[9[9]6646KX]^68O\abYYt1143E3EFOO\
99T//1C1C%PYYt1143E3EFOO\
zz&"E"EFr%   Nr   attention_maskoutput_attentionsrH   c                 R   |j                   \  }}}| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }	t        j                  j                  |d      t        j                  j                  |d      j                  dd      z  }
t        j                  | j                  t        j                  d            j!                         }|
|z  }
| j#                  | j$                        j                  d| j                        }|| j&                  j                  d         j                  | j(                  d   | j(                  d   z  | j(                  d   | j(                  d   z  d      }|j+                  ddd      j-                         }d	t        j.                  |      z  }|
|j1                  d      z   }
||j                   d   }|
j                  ||z  || j                  ||      |j1                  d      j1                  d      z   }
|
|j1                  d      j1                  d      z   }
|
j                  d| j                  ||      }
t        j                  j3                  |
d      }| j5                  |      }t        j6                  ||	      }|j+                  dddd
      j-                         }|j9                         d d | j:                  fz   }|j                  |      }|r||f}|S |f}|S )Nr5   r   r2   r   g      Y@)maxr      r   )r7   r   r8   r   r   r   r   r   r   r   	normalizer    clampr   mathr   expr   r   r   r<   r9   r:   sigmoidr   softmaxry   matmulr   r   )r[   r   r   r   r=   r   r@   query_layer	key_layervalue_layerattention_scoresr   relative_position_bias_tablerelative_position_bias
mask_shapeattention_probscontext_layernew_context_layer_shapeoutputss                      r&   r_   zSwinv2SelfAttention.forward  sx    )6(;(;%
CJJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 ==22;B2G"--JaJa2 Kb K

)B
 kk$"2"28LMQQS+k9'+'H'HIcIc'd'i'i(((
$ ">d>Z>Z>_>_`b>c!d!i!iQ$"2"21"55t7G7G7JTM]M]^_M`7`bd"
 "8!?!?1a!H!S!S!U!#emm4J&K!K+.D.N.Nq.QQ%'--a0J/44j(*d6N6NPSUX ((+55a8 9  0.2J2J12M2W2WXY2ZZ/44R9Q9QSVX[\ --//0@b/I ,,7 _kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2 O\M]r%   c                 8   t        j                  | j                  d   dz
   | j                  d   t         j                        j	                         }t        j                  | j                  d   dz
   | j                  d   t         j                        j	                         }t        j
                  t        j                  ||gd            j                  ddd      j                         j                  d      }| j                  d   dkD  rO|d d d d d d dfxx   | j                  d   dz
  z  cc<   |d d d d d d dfxx   | j                  d   dz
  z  cc<   n`| j                  d   dkD  rN|d d d d d d dfxx   | j                  d   dz
  z  cc<   |d d d d d d dfxx   | j                  d   dz
  z  cc<   |dz  }t        j                  |      t        j                  t        j                  |      dz         z  t        j                  d      z  }|j                  t!        | j"                  j%                               j&                        }t        j                  | j                  d         }t        j                  | j                  d         }t        j
                  t        j                  ||gd            }t        j(                  |d      }|d d d d d f   |d d d d d f   z
  }|j                  ddd      j                         }|d d d d dfxx   | j                  d   dz
  z  cc<   |d d d d dfxx   | j                  d   dz
  z  cc<   |d d d d dfxx   d| j                  d   z  dz
  z  cc<   |j+                  d	      }	||	fS )
Nr   r   rK   ij)indexingr2      r   r5   )r    aranger<   int64rc   stackmeshgridr9   r:   r   r   signlog2absr   tonextr   
parametersrK   r   sum)
r[   relative_coords_hrelative_coords_wr   coords_hcoords_wcoordscoords_flattenrelative_coordsr   s
             r&   r   z1Swinv2SelfAttention.create_coords_table_and_index  s?   !LL4+;+;A+>+B)CTEUEUVWEX`e`k`klrrt!LL4+;+;A+>+B)CTEUEUVWEX`e`k`klrrtKK(9;L'MX\]^WQ1Z\Yq\	 	 &&q)A-!!Q1*-1L1LQ1ORS1SS-!!Q1*-1L1LQ1ORS1SS-a 1$!!Q1*-1A1A!1Dq1HH-!!Q1*-1A1A!1Dq1HH-"JJ,-

599EZ;[^a;a0bbeienenopeqq 	 !6 8 8d>_>_>j>j>l9m9s9s t << 0 0 34<< 0 0 34U^^Xx,@4PQvq1(At4~aqj7QQ)11!Q:EEG1a D$4$4Q$7!$;; 1a D$4$4Q$7!$;; 1a A(8(8(;$;a$?? "1"5"5b"9$&===r%   r   )r   r   r   rZ   r    r   r!   r   r#   r_   r   re   rf   s   @r&   r   r     sm    TUWXSY G@ 48).	B||B ))D0B  $;	B
 
u||	BH#>r%   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )Swinv2SelfOutputc                     t         |           t        j                  ||      | _        t        j
                  |j                        | _        y rX   )rY   rZ   r   r   denserw   r   ry   r[   r{   r   r\   s      r&   rZ   zSwinv2SelfOutput.__init__  s6    YYsC(
zz&"E"EFr%   r   input_tensorrH   c                 J    | j                  |      }| j                  |      }|S rX   r  ry   )r[   r   r  s      r&   r_   zSwinv2SelfOutput.forward  s$    

=1]3r%   r   r   r   rZ   r    r   r_   re   rf   s   @r&   r  r    s2    G
U\\  RWR^R^ r%   r  c            
            e Zd Zd fd	Z	 	 d	dej
                  dej                  dz  dedz  deej
                     fdZ	 xZ
S )
Swinv2Attentionc           
          t         |           t        ||||t        |t        j
                  j                        r|n||f      | _        t        ||      | _	        y )Nr{   r   r   r<   r   )
rY   rZ   r   r   r   r   r   r[   r  rS   )r[   r{   r   r   r<   r   r\   s         r&   rZ   zSwinv2Attention.__init__  sY    '#0+//2J2JK $:(*@A
	 'vs3r%   Nr   r   r   rH   c                 h    | j                  |||      }| j                  |d   |      }|f|dd  z   }|S )Nr   r   )r[   rS   )r[   r   r   r   self_outputsattention_outputr   s          r&   r_   zSwinv2Attention.forward+  sE     yy@QR;;|AF#%QR(88r%   r   r   )r   r   r   rZ   r    r   r!   r   r#   r_   re   rf   s   @r&   r  r    sW    4  48).		||	 ))D0	  $;		
 
u||		r%   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Swinv2Intermediatec                    t         |           t        j                  |t	        |j
                  |z              | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y rX   )rY   rZ   r   r   r   	mlp_ratior  r   
hidden_actrd   r   intermediate_act_fnr  s      r&   rZ   zSwinv2Intermediate.__init__9  sa    YYsC(8(83(>$?@
f''-'-f.?.?'@D$'-'8'8D$r%   r   rH   c                 J    | j                  |      }| j                  |      }|S rX   )r  r*  r^   s     r&   r_   zSwinv2Intermediate.forwardA  s&    

=100?r%   r  rf   s   @r&   r&  r&  8  s#    9U\\ ell r%   r&  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Swinv2Outputc                     t         |           t        j                  t	        |j
                  |z        |      | _        t        j                  |j                        | _	        y rX   )
rY   rZ   r   r   r   r(  r  rw   rx   ry   r  s      r&   rZ   zSwinv2Output.__init__I  sF    YYs6#3#3c#9:C@
zz&"<"<=r%   r   rH   c                 J    | j                  |      }| j                  |      }|S rX   r  r^   s     r&   r_   zSwinv2Output.forwardN  s$    

=1]3r%   r  rf   s   @r&   r-  r-  H  s#    >
U\\ ell r%   r-  c                        e Zd Z	 d fd	Zdeeeef   eeef   f   fdZd Zd Z	 dde	j                  deeef   ded	z  dee	j                  e	j                  f   fd
Z xZS )Swinv2Layerc           
      n   t         	|           || _        | j                  |j                  |j                  f||f      \  }}|d   | _        |d   | _        t        |||| j                  t        |t        j                  j                        r|n||f      | _        t        j                  ||j                        | _        |dkD  rt!        |      nt        j"                         | _        t'        ||      | _        t+        ||      | _        t        j                  ||j                        | _        y )Nr   r   epsrJ   )rY   rZ   r   _compute_window_shiftr<   
shift_sizer  r   r   r   r   	attentionr   ru   layer_norm_epslayernorm_beforerV   IdentityrT   r&  intermediater-  rS   layernorm_after)
r[   r{   r   r   r   drop_path_rater6  r   r<   r\   s
            r&   rZ   zSwinv2Layer.__init__U  s    	 0"&"<"<!3!34z:6N#
Z 'q>$Q-(((0+//2J2JK $:(*@A
 !#Sf6K6K L;IC;O7UWU`U`Ub.vs;"63/!||CV5J5JKr%   rH   c                     t        | j                  |      D cg c]  \  }}t        ||       }}}t        | j                  ||      D cg c]  \  }}}||k  rdn| }}}}||fS c c}}w c c}}}w Nr   )zipr   min)r[   target_window_sizetarget_shift_sizerwr<   sr6  s           r&   r5  z!Swinv2Layer._compute_window_shiftn  sy    -01F1FHZ-[\TQs1ay\\8;D<Q<QS^`q8rssWQ116aq(s
sJ&& ]ss   A*A0c           	         | j                   dkD  rgt        j                  d||df|      }t        d| j                         t        | j                   | j                          t        | j                    d       f}t        d| j                         t        | j                   | j                          t        | j                    d       f}d}|D ]  }|D ]  }	||d d ||	d d f<   |dz  }  t        || j                        }
|
j                  d| j                  | j                  z        }
|
j                  d      |
j                  d      z
  }|j                  |dk7  d      j                  |dk(  d      }|S d }|S )Nr   r   r   r5   r2   g      YrJ   )	r6  r    rp   slicer<   rB   r8   r   masked_fill)r[   r>   r?   rK   img_maskheight_sliceswidth_slicescountheight_slicewidth_slicemask_windows	attn_masks               r&   get_attn_maskzSwinv2Layer.get_attn_masks  s   ??Q{{Avua#8FHa$***+t'''$//)9:t&-M a$***+t'''$//)9:t&-L
 E - #/ K@EHQk1<=QJE
 ,Hd6F6FGL',,R1A1ADDTDT1TUL$..q1L4J4J14MMI!--i1nfEQQR[_`R`befI  Ir%   c                     | j                   || j                   z  z
  | j                   z  }| j                   || j                   z  z
  | j                   z  }ddd|d|f}t        j                  j                  ||      }||fS r?  )r<   r   r   r   )r[   r   r>   r?   	pad_right
pad_bottomr   s          r&   r   zSwinv2Layer.maybe_pad  s    %%0@0@(@@DDTDTT	&&$2B2B)BBdFVFVV
Ay!Z8
))-Dj((r%   r   r   r   Nc                    |\  }}|j                         \  }}}|}	|j                  ||||      }| j                  |||      \  }}
|j                  \  }}}}| j                  dkD  r1t        j                  || j                   | j                   fd      }n|}t        || j                        }|j                  d| j                  | j                  z  |      }| j                  |||j                        }||j                  |j                        }| j                  |||      }|d   }|j                  d| j                  | j                  |      }t        || j                  ||      }| j                  dkD  r/t        j                  || j                  | j                  fd      }n|}|
d   dkD  xs |
d   dkD  }|r|d d d |d |d d f   j                         }|j                  |||z  |      }| j!                  |      }|	| j#                  |      z   }| j%                  |      }| j'                  |      }|| j#                  | j)                  |            z   }|r	||d	   f}|S |f}|S )
Nr   )r   r2   )shiftsdimsr5   r   )r   r   r4   r   )r   r8   r   r7   r6  r    rollrB   r<   rR  rK   r	  rL   r7  rD   r:   r9  rT   r;  rS   r<  )r[   r   r   r   r>   r?   r=   r   channelsshortcutr   
height_pad	width_padshifted_hidden_stateshidden_states_windowsrQ  attention_outputsr#  attention_windowsshifted_windows
was_paddedlayer_outputlayer_outputss                          r&   r_   zSwinv2Layer.forward  s    )"/"4"4"6
Ax  &**:vuhO$(NN=&%$P!z&3&9&9#:y!??Q$)JJ}tFVY]YhYhXhEipv$w!$1! !11FHXHX Y 5 : :2t?O?ORVRbRb?bdl m&&z9MDWDW&X	 !%:%A%ABI NN+@)_pNq,Q/,11"d6F6FHXHXZbc():D<L<LjZcd ??Q %

?DOOUYUdUdCelr s /]Q&;*Q-!*;
 1!WfWfufa2G H S S U-22:v~xX--.?@ 4>>-#@@((7{{<0$t~~d6J6J<6X'YY@Q'8';< YeWfr%   )rJ   r   r   r   )r   r   r   rZ   r#   r   r5  rR  r   r    r   r   r_   re   rf   s   @r&   r1  r1  T  s    qrL2'eTYZ]_bZbTcejknpsksetTtNu '
8) */	5||5  S/5  $;	5
 
u||U\\)	*5r%   r1  c            
       |     e Zd Z	 d fd	Z	 d	dej
                  deeef   dedz  deej
                     fdZ	 xZ
S )
Swinv2Stagec	           
      |   t         |           || _        || _        g }	t	        |      D ]?  }
t        ||||||
   |
dz  dk(  rdn|j                  dz  |      }|	j                  |       A t        j                  |	      | _
        |& |||t        j                        | _        d| _        y d | _        d| _        y )Nr2   r   )r{   r   r   r   r=  r6  r   )r   r   F)rY   rZ   r{   r   ranger1  r<   appendr   
ModuleListblocksru   
downsamplepointing)r[   r{   r   r   depthr   rT   rm  r   rl  iblockr\   s               r&   rZ   zSwinv2Stage.__init__  s     	u 
	!A!1#(|!"Q!1&2D2D2I'=E MM% 
	! mmF+ !()9sr||\DO  #DOr%   r   r   r   NrH   c                     |\  }}t        | j                        D ]  \  }} ||||      }|d   } |}	| j                  )|dz   dz  |dz   dz  }}
|||
|f}| j                  |	|      }n||||f}||	|f}|r|dd  z  }|S )Nr   r   r2   )	enumeraterl  rm  )r[   r   r   r   r>   r?   rp  layer_modulere  !hidden_states_before_downsamplingheight_downsampledwidth_downsampledr   stage_outputss                 r&   r_   zSwinv2Stage.forward  s     )(5 	-OA|( !M *!,M	- -:)??&5;aZA4EPQ	VWGW 1!'0BDU V OO,MO_`M!' >&(IK\]]12..Mr%   r$  r   )r   r   r   rZ   r    r   r#   r   r   r_   re   rf   s   @r&   rg  rg    sU    mn@ */	||  S/  $;	
 
u||	r%   rg  c                        e Zd Zd fd	Z	 	 	 	 ddej
                  deeef   dedz  dedz  dedz  dedz  d	ee	z  fd
Z
 xZS )Swinv2Encoderc                 >   t         	|           t        |j                        | _        || _        | j
                  j                  |j                  }t        j                  d|j                  t        |j                        d      D cg c]  }|j                          }}g }t        | j                        D ]  }t        |t        |j                  d|z  z        |d   d|z  z  |d   d|z  z  f|j                  |   |j                   |   |t        |j                  d |       t        |j                  d |dz           || j                  dz
  k  rt"        nd ||         }|j%                  |        t'        j(                  |      | _        d| _        y c c}w )Nr   cpu)rL   r2   r   )r{   r   r   ro  r   rT   rm  r   F)rY   rZ   lendepths
num_layersr{   pretrained_window_sizesr    linspacer=  r  itemri  rg  r   rq   r   r   rj  r   rk  layersgradient_checkpointing)
r[   r{   rm   r  xdprr  i_layerstager\   s
            r&   rZ   zSwinv2Encoder.__init__  sw   fmm,;;..:&,&D&D#!&63H3H#fmmJ\ej!klAqvvxllT__- 	!G((1g:56"+A,1g:">	!QRT[Q[@\!]mmG, **73c&--"9:S}QX[\Q\A]=^_29DOOa<O2O-VZ'>w'G	E MM% 	! mmF+&+## ms   Fr   r   r   Noutput_hidden_states(output_hidden_states_before_downsamplingreturn_dictrH   c                    |rdnd }|rdnd }|rdnd }	|rE|j                   \  }
}} |j                  |
g|| }|j                  dddd      }||fz  }||fz  }t        | j                        D ]  \  }} ||||      }|d   }|d   }|d   }|d   |d   f}|rP|rN|j                   \  }
}} |j                  |
g|d   |d   f| }|j                  dddd      }||fz  }||fz  }nI|rG|sE|j                   \  }
}} |j                  |
g|| }|j                  dddd      }||fz  }||fz  }|s|	|dd  z  }	 |st        d |||	|fD              S t        |||	|	      S )
Nr$   r   r   r   r2   r   r5   c              3   $   K   | ]  }|| 
 y wrX   r$   ).0vs     r&   	<genexpr>z(Swinv2Encoder.forward.<locals>.<genexpr>\  s      = s   )r   r   r   r   )r7   r8   r9   rs  r  r#   r   )r[   r   r   r   r  r  r  all_hidden_statesall_reshaped_hidden_statesall_self_attentionsr=   r   r   reshaped_hidden_staterp  rt  re  ru  r   s                      r&   r_   zSwinv2Encoder.forward$  sB    #7BD+?RT"$5b4)6)<)<&J;$6M$6$6z$bDT$bVa$b!$9$A$A!Q1$M!-!11&+@*BB&(5  	9OA|( !M *!,M0=a0@- -a 0 1" 57H7LM#(P-N-T-T*
A{ )O(I(N(N)"3A"68I!8L!M)OZ)% )>(E(EaAq(Q%!&G%II!*/D.FF*%.V-:-@-@*
A{(:(:(::(fHX(fZe(f%(=(E(EaAq(Q%!m%55!*/D.FF* #}QR'88#A 	9D  '):<OQkl   #++*#=	
 	
r%   ))r   r   r   r   )FFFT)r   r   r   rZ   r    r   r#   r   r   r   r_   re   rf   s   @r&   rz  rz  
  s    ,: */,1@E#'C
||C
  S/C
  $;	C

 #TkC
 37+C
 D[C
 
$	$C
r%   rz  c                   \    e Zd ZU eed<   dZdZdZdZdgZ	 e
j                         d        Zy)	Swinv2PreTrainedModelr{   swinv2r   )imageTrg  c                    t        |t        j                  t        j                  f      rct	        j
                  |j                  d| j                  j                         |j                   t	        j                  |j                         yyt        |t        j                        r?t	        j                  |j                         t	        j                  |j                         yt        |t              rX|j                  t	        j                  |j                         |j                   t	        j                  |j                         yyt        |t               rt	        j"                  |j$                  t'        j(                  d             |j+                         \  }}t	        j,                  |j.                  |       t	        j,                  |j0                  |       yy)zInitialize the weightsrJ   )meanstdNr   )r   r   r   r   initnormal_weightr{   initializer_ranger   zeros_ru   ones_rh   rr   rt   r   	constant_r   r   r   r   copy_r   r   )r[   moduler   r   s       r&   _init_weightsz#Swinv2PreTrainedModel._init_weightss  s=    fryy"))45LLSdkk6S6ST{{&FKK( '-KK$JJv}}% 01  ,F--.))5F667 6 34NN6--txx|<=C=a=a=c:!#:JJv335JKJJv557NO	 5r%   N)r   r   r   r   r"   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modulesr    no_gradr  r$   r%   r&   r  r  j  sC     $O!&*#&U]]_P Pr%   r  c                        e Zd Zd fd	Zd Ze	 	 	 	 	 	 ddej                  dz  dej                  dz  de	dz  de	dz  de	d	e	dz  d
e
ez  fd       Z xZS )Swinv2Modelc                    t         |   |       || _        t        |j                        | _        t        |j                  d| j
                  dz
  z  z        | _        t        ||      | _
        t        || j                  j                        | _        t        j                  | j                  |j                         | _        |rt        j$                  d      nd| _        | j)                          y)a  
        add_pooling_layer (`bool`, *optional*, defaults to `True`):
            Whether or not to apply pooling layer.
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether or not to create and apply mask tokens in the embedding layer.
        r2   r   )r|   r3  N)rY   rZ   r{   r}  r~  r  r   rq   num_featuresrh   r}   rz  rn   encoderr   ru   r8  	layernormAdaptiveAvgPool1dpooler	post_init)r[   r{   add_pooling_layerr|   r\   s       r&   rZ   zSwinv2Model.__init__  s     	 fmm, 0 0119L3M MN*6.Q$VT__-G-GHd&7&7V=R=RS1Bb**1- 	r%   c                 .    | j                   j                  S rX   r}   rk   ra   s    r&   get_input_embeddingsz Swinv2Model.get_input_embeddings      ///r%   Nr   r   r   r  r   r  rH   c                 @   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j                  |||      \  }}	| j                  ||	|||      }
|
d   }| j                  |      }d}| j                  7| j                  |j                  dd            }t        j                  |d      }|s||f|
dd z   }|S t        |||
j                  |
j                  |
j                        S )	z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)r   r   )r   r  r  r   r   r2   )r   r)   r   r   r   )r{   r   r  r  r   r}   r  r  r  r   r    r   r(   r   r   r   )r[   r   r   r   r  r   r  kwargsembedding_outputr   encoder_outputssequence_outputpooled_outputrS   s                 r&   r_   zSwinv2Model.forward  sJ    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY?@@-1__/Tl .= .
** ,,/!5# ' 
 *!,..9;;" KK(A(A!Q(GHM!MM-;M%}58KKFM -')77&11#2#I#I
 	
r%   )TFNNNNFN)r   r   r   rZ   r  r   r    r!   r   r   r#   r(   r_   re   rf   s   @r&   r  r    s    *0  2637)-,0).#'6
''$.6
 ))D06
  $;	6

 #Tk6
 #'6
 D[6
 
"	"6
 6
r%   r  a~  
        Swinv2 Model with a decoder on top for masked image modeling, as proposed in
    [SimMIM](https://huggingface.co/papers/2111.09886).

        <Tip>

        Note that we provide a script to pre-train this model on custom data in our [examples
        directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

        </Tip>
    c                        e Zd Z fdZe	 	 	 	 	 	 ddej                  dz  dej                  dz  dedz  dedz  dededz  d	e	e
z  fd
       Z xZS )Swinv2ForMaskedImageModelingc                    t         |   |       t        |dd      | _        t	        |j
                  d|j                  dz
  z  z        }t        j                  t        j                  ||j                  dz  |j                  z  d      t        j                  |j                              | _        | j                          y )NFT)r  r|   r2   r   )in_channelsout_channelsr   )rY   rZ   r  r  r   rq   r  r   r   r   encoder_strider@   PixelShuffledecoderr  )r[   r{   r  r\   s      r&   rZ   z%Swinv2ForMaskedImageModeling.__init__  s     !&ERVW6++aF4E4E4I.JJK}}II(v7L7La7ORXReRe7est OOF112	
 	r%   Nr   r   r   r  r   r  rH   c                    ||n| j                   j                  }| j                  ||||||      }|d   }	|	j                  dd      }	|	j                  \  }
}}t        j                  |dz        x}}|	j                  |
|||      }	| j                  |	      }d}|| j                   j                  | j                   j                  z  }|j                  d||      }|j                  | j                   j                  d      j                  | j                   j                  d      j                  d      j                         }t        j                  j!                  ||d	      }||z  j#                         |j#                         d
z   z  | j                   j$                  z  }|s|f|dd z   }||f|z   S |S t'        |||j(                  |j*                  |j,                        S )a  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, Swinv2ForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
        >>> model = Swinv2ForMaskedImageModeling.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 256, 256]
        ```N)r   r   r  r   r  r   r   r2   r   r5   none)r   gh㈵>)r,   r-   r   r   r   )r{   r  r  r   r7   r   floorr   r  r   rz   repeat_interleaver   r:   r   r   l1_lossr  r@   r+   r   r   r   )r[   r   r   r   r  r   r  r  r   r  r=   r@   sequence_lengthr>   r?   reconstructed_pixel_valuesmasked_im_lossr   r   reconstruction_lossrS   s                        r&   r_   z$Swinv2ForMaskedImageModeling.forward  s   P &1%<k$++BYBY+++/!5%=#  
 "!*)33Aq94C4I4I1
L/OS$899)11*lFTYZ &*\\/%B"&;;))T[[-C-CCD-55b$EO11$++2H2H!L""4;;#9#91=1	  #%--"7"7F`lr"7"s1D8==?488:PTCTUX\XcXcXpXppN02WQR[@F3A3M^%.YSYY.5!//))#*#A#A
 	
r%   r  )r   r   r   rZ   r   r    r!   r   r   r#   r+   r_   re   rf   s   @r&   r  r    s       2637)-,0).#'S
''$.S
 ))D0S
  $;	S

 #TkS
 #'S
 D[S
 
0	0S
 S
r%   r  a  
    Swinv2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
    of the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune SwinV2 on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    c                        e Zd Z fdZe	 	 	 	 	 	 ddej                  dz  dej                  dz  dedz  dedz  dededz  d	e	e
z  fd
       Z xZS )Swinv2ForImageClassificationc                 >   t         |   |       |j                  | _        t        |      | _        |j                  dkD  r4t        j                  | j                  j                  |j                        nt        j                         | _	        | j                          y r?  )rY   rZ   
num_labelsr  r  r   r   r  r:  
classifierr  )r[   r{   r\   s     r&   rZ   z%Swinv2ForImageClassification.__init__d  sx      ++!&) GMFWFWZ[F[BIIdkk..0A0ABacalalan 	
 	r%   Nr   labelsr   r  r   r  rH   c                 V   ||n| j                   j                  }| j                  |||||      }|d   }	| j                  |	      }
d}|| j	                  ||
| j                         }|s|
f|dd z   }||f|z   S |S t        ||
|j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r  r   r  r   r2   )r,   r0   r   r   r   )	r{   r  r  r  loss_functionr/   r   r   r   )r[   r   r  r   r  r   r  r  r   r  r0   r,   rS   s                r&   r_   z$Swinv2ForImageClassification.forwardr  s    " &1%<k$++BYBY++/!5%=#  
  
/%%ffdkkBDY,F)-)9TGf$EvE*!//))#*#A#A
 	
r%   r  )r   r   r   rZ   r   r    r!   
LongTensorr   r#   r/   r_   re   rf   s   @r&   r  r  T  s       26*.)-,0).#',
''$.,
   4',
  $;	,

 #Tk,
 #',
 D[,
 
,	,,
 ,
r%   r  zO
    Swinv2 backbone, to be used with frameworks like DETR and MaskFormer.
    c                   x     e Zd Z fdZd Zeee	 	 	 d
dede	dz  de	dz  de	dz  de
f
d	                     Z xZS )Swinv2Backbonec           	      h   t         |   |       |j                  gt        t	        |j
                              D cg c]  }t        |j                  d|z  z         c}z   | _        t        |      | _	        t        || j                  j                        | _        | j                          y c c}w )Nr2   )rY   rZ   rq   ri  r}  r~  r   r  rh   r}   rz  rn   r  r  )r[   r{   rp  r\   s      r&   rZ   zSwinv2Backbone.__init__  s     #--.X]^abhbobo^pXq1rST#f6F6FA6M2N1rr*62$VT__-G-GH 	 2ss   "B/c                 .    | j                   j                  S rX   r  ra   s    r&   r  z#Swinv2Backbone.get_input_embeddings  r  r%   Nr   r   r  r  rH   c                 
   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |      \  }}| j                  |||dd|      }|r|j                  n|d   }	d}
t        | j                  |	      D ]  \  }}|| j                  v s|
|fz  }
 |s|
f}|r	||d   fz  }|r	||d   fz  }|S t        |
|r|j                  nd|j                        S )	a  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
        >>> model = AutoBackbone.from_pretrained(
        ...     "microsoft/swinv2-tiny-patch4-window8-256", out_features=["stage1", "stage2", "stage3", "stage4"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 2048, 7, 7]
        ```NT)r   r  r  r  r5   r$   r   r2   )feature_mapsr   r   )r{   r  r  r   r}   r  r   r@  stage_namesout_featuresr   r   r   )r[   r   r   r  r  r  r  r   r   r   r  r  hidden_staterS   s                 r&   r_   zSwinv2Backbone.forward  sA   J &1%<k$++BYBY$8$D $++JjJj 	 2C1N-TXT_T_TqTq-1__\-J**,,/!%59#  
 ;F667SU;#&t'7'7#G 	0E<)))/	0 "_F#71:-' 71:-'M%3G'//T))
 	
r%   )NNN)r   r   r   rZ   r  r   r
   r   r   r   r   r_   re   rf   s   @r&   r  r    s    0   *.,0#'F
F
  $;F
 #Tk	F

 D[F
 
F
  ! F
r%   r  )r  r  r  r  r  )rJ   F)?r   collections.abcr   r   dataclassesr   r    r   r    r   r  activationsr   backbone_utilsr	   r
   modeling_layersr   modeling_outputsr   modeling_utilsr   utilsr   r   r   r   utils.genericr   configuration_swinv2r   
get_loggerr   loggerr   r(   r+   r/   rB   rD   rc   r   rT   r   rV   rh   rj   r   r   r  r  r&  r-  r1  rg  rz  r  r  r  r  r  __all__r$   r%   r&   <module>r     s   (   !   & ! H 9 . - D D - . 
		H	% H+ H H  H H H& Hk H H* H+ H H,	U\\ e T V[VbVb  %RYY %Y-ryy Y-z(-BII (-V3 3lE>")) E>R
ryy 
bii 6  	299 	w")) wt9, 9x]
BII ]
@ PO P P< P
' P
 P
f 
e
#8 e
e
P <
#8 <
<
~ 
W
]$9 W

W
tr%   