
    i                        d dl Z d dlmZ d dlmZ d dlZd dlmZ ddlmZ	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZmZmZm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z(m)Z) ddl*m+Z+m,Z,m-Z- ddl.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7 ddl8m9Z9 dZ: e jv                  e<      Z= G d dej                  j|                        Z? G d de6      Z@ G d de(      ZA G d d e/      ZB G d! d"ej|                        ZC G d# d$ej|                        ZD G d% d&e0      ZE G d' d(e4      ZF G d) d*e3      ZGe G d+ d,e             ZH G d- d.e5eH      ZI G d/ d0e1      ZJ G d1 d2e2      ZKg d3ZLy)4    N)Callable)cycle)nn   )initialization)ACT2FN)CacheDynamicCache)lazy_load_kernel)create_causal_mask)BaseModelOutputWithPast SequenceClassifierOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torchdynamo_compilinglogging)merge_with_config_defaults)resolve_internal_import)capture_outputs   )LlamaRotaryEmbeddingapply_rotary_pos_emb)pad_tensor_by_sizereshape_into_chunkssegment_sum)	ZambaAttentionZambaAttentionDecoderLayerZambaForCausalLMZambaForSequenceClassificationZambaHybridLayerZambaMambaDecoderLayer
ZambaModelZambaRMSNormeager_attention_forward   )Zamba2ConfigzZyphra/Zamba2-2.7Bc                   (     e Zd Zd fd	ZddZ xZS )Zamba2RMSNormGatedc                     t         |           t        j                  t	        j
                  |            | _        || _        || _        y N)	super__init__r   	Parametertorchonesweightvariance_epsilon
group_size)selfhidden_sizer6   eps	__class__s       z/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/zamba2/modular_zamba2.pyr0   zZamba2RMSNormGated.__init__8   s6    ll5::k#:; #$    c                 b   |j                   }|j                  t        j                        }|?|t        j
                  j                  |j                  t        j                              z  }|j                  ^ }}|| j                  z  } |j                  g ||| j                   }|j                  d      j                  dd      }|t        j                  || j                  z         z  } |j                  g ||| j                  z   }| j                  |j                  |      z  S )Nr   T)keepdim)dtypetor2   float32r   
functionalsilushaper6   viewpowmeanrsqrtr5   r4   )	r7   hidden_statesgateinput_dtypeprefix_dimslast_dimgroup_counthidden_states_groupvariances	            r;   forwardzZamba2RMSNormGated.forward>   s   #))%((7)BMM,>,>twwu}}?U,VVM!.!4!4h$//10m00\+\{\DOO\&**1-222t2D1EKK4K`K`@`4aa0+00]+]{T__?\]{{]--k:::r<   )gư>r.   )__name__
__module____qualname__r0   rR   __classcell__r:   s   @r;   r,   r,   7   s    %;r<   r,   c                       e Zd Zy)Zamba2RMSNormNrS   rT   rU    r<   r;   rY   rY   L       r<   rY   c                       e Zd Zy)Zamba2RotaryEmbeddingNrZ   r[   r<   r;   r^   r^   P   r\   r<   r^   c                   F    e Zd ZdZ	 	 	 ddededz  dedz  dedz  f fdZ	 	 	 ddej                  ded	ej                  dz  d
e	dz  de
ej                  ej                  f   dz  dee   de
ej                  ej                  dz  e
ej                     dz  f   fdZ xZS )Zamba2AttentionaZ  
    Multi-headed attention from 'Attention Is All You Need' paper.

    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
    (see fig. 2 in https://huggingface.co/papers/2405.16712).
    Additionally, replaced
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
    Finally, this attention layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this
    layer is tied, un-tied adapters (formally the same as LoRA but used in the base model) modules are added to the q, k, v projectors to increase
    expressivity with a small memory overhead (see Fig. 2 of https://huggingface.co/papers/2411.15242).
    Nconfig	layer_idxnum_fwd_mem_blocksblock_idc           	         t         |   ||       || _        |j                  | _        || _        |j                  rt        j                  g       | _	        t        j                  g       | _
        t        j                  g       | _        t        | j                        D ]  }||j                  z  |k(  r{t        j                  t        j                  | j                   | j"                  j$                  d      t        j                  | j"                  j$                  | j                   d            }t        j                  t        j                  | j                   | j"                  j$                  d      t        j                  | j"                  j$                  | j                   d            }t        j                  t        j                  | j                   | j"                  j$                  d      t        j                  | j"                  j$                  | j                   d            }n<t        j&                         }t        j&                         }t        j&                         }| j                  j)                  |       | j                  j)                  |       | j                  j)                  |       ! t+        | j                        D 	
ci c]  \  }	}
|
|	
 c}
}	| _        y c c}
}	w )NFbias)r/   r0   rc   hybrid_layer_idslayer_block_maprd   use_shared_attention_adapterr   
ModuleListlinear_q_adapter_listlinear_k_adapter_listlinear_v_adapter_listrangenum_mem_blocks
SequentialLinearattention_hidden_sizera   adapter_rankIdentityappend	enumerate	layer_dic)r7   ra   rb   rc   rd   ilinear_q_adapterlinear_k_adapterlinear_v_adapterindexvaluer:   s              r;   r0   zZamba2Attention.__init__d   s#    	+"4%66 ..)+r):D&)+r):D&)+r):D&4223 Dv,,,8')}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($ (*}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($ (*}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($
 (*{{}$'){{}$'){{}$**112BC**112BC**112BC)D, <ETEYEY;Z[<5%%,[[s   K2rJ   attention_maskpast_key_valuesposition_embeddingskwargsreturnc                    |j                   d d }g |d| j                  }| j                  |      }	| j                  |      }
| j	                  |      }| j
                  j                  rW| j                  |   }|	 | j                  |   |      z   }	|
 | j                  |   |      z   }
| | j                  |   |      z   }|	j                  |      j                  dd      }	|
j                  |      j                  dd      }
|j                  |      j                  dd      }| j
                  j                  r|\  }}t        |	|
||      \  }	}
||j                  |
||      \  }
}t!        j"                  | j
                  j$                  t&              } || |	|
||f| j(                  sdn| j*                  | j,                  d|\  }} |j.                  g |d j1                         }| j3                  |      }||fS )Nr>   r)   r   g        )dropoutscaling)rE   head_dimq_projk_projv_projra   rj   rx   rl   rm   rn   rF   	transposeuse_mem_roper   updater   get_interface_attn_implementationr(   trainingattention_dropoutr   reshape
contiguouso_proj)r7   rJ   rb   r   r   r   r   input_shapehidden_shapequery_states
key_statesvalue_statesadapter_layer_idxcossinattention_interfaceattn_outputattn_weightss                     r;   rR   zZamba2Attention.forward   s    $))#2.88b8$--8{{=1[[/
{{=1;;33 $y 9'*W$*D*DEV*WXe*ffL#&Sd&@&@AR&STa&bbJ'*W$*D*DEV*WXe*ffL#((6@@AF__\2<<QB
#((6@@AF;;##*HC';L*VY[^'_$L*&'6'='=j,Xa'b$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r<   NNN)rS   rT   rU   __doc__r*   intr0   r2   Tensorr	   tupler   r   rR   rV   rW   s   @r;   r`   r`   T   s    $ !%)-#'\'\ :'\  $J	'\
 *'\Z /3(,HL1)||1) 1) t+	1)
 1) #5<<#=>E1) +,1) 
u||U\\D0%2E2LL	M1)r<   r`   c                        e Zd ZdZddededz  f fdZ	 	 ddej                  de	dz  dej                  dz  fd	Z
dde	dz  dej                  dz  fd
Z	 	 dde	dz  dej                  dz  fdZ xZS )Zamba2MambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    Nra   rb   c           	         t         |           || _        |j                  | _        |j                  | _        |j                  | _        t        |j                  | j                  z        | _
        || _        |j                  | _        d| _        t        j                         | _        |j"                  | _        |j$                  | _        |j(                  | _        | j                  j,                  | _        |j0                  | _        |j2                  | _        |j4                  | _        |j6                  | _        | j                  d| j&                  z  | j
                  z  z   | _        t        j:                  | j8                  | j8                  d|j                  | j8                  |j                  dz
        | _        | j                  | j8                  z   | j.                  z   }t        j>                  | j                  ||j@                        | _!        t        jD                  tG        jH                  | j.                              | _%        tG        jL                  d| j.                  dz         }t        jD                  tG        jN                  |            | _(        tS        | j                  | j                  | j&                  z  d      | _*        t        jD                  tG        jH                  | j.                              | _+        t        j>                  | j                  | j                  |j@                        | _,        t[        d	      }t]        |d
d       a/t]        |dd       a0t[        d      }tc        |d      a2tc        |d      a3tc        |d      a4tk        td        tf        th        t`        t^        f      a6tl        stn        jq                  d       y y )NrD   r   Tr)   )in_channelsout_channelsrg   kernel_sizegroupspaddingrf   gh㈵>)r6   r9   zcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathz1ops.triton.ssd_combined.mamba_chunk_scan_combinedz8ops.triton.ssd_combined.mamba_split_conv1d_scan_combineda  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d)9r/   r0   ra   r8   mamba_d_statessm_state_sizemamba_d_convconv_kernel_sizer   mamba_expandintermediate_sizerb   use_conv_bias
activationr   SiLUactuse_mem_eff_pathmamba_ngroupsn_groupsmamba_headdimr   n_mamba_heads	num_heads
chunk_sizetime_step_limittime_step_mintime_step_maxconv_dimConv1dconv1drr   add_bias_linearin_projr1   r2   r3   dt_biasarangelogA_logr,   normDout_projr   getattrr   r   r   selective_state_updatemamba_chunk_scan_combined mamba_split_conv1d_scan_combinedallis_fast_path_availableloggerwarning_once)r7   ra   rb   projection_sizeAcausal_conv1d	mamba_ssmr:   s          r;   r0   zZamba2MambaMixer.__init__   s   !--$22 & 3 3!$V%8%84;K;K%K!L"#11 779 & 7 7,,,,22 ++%55#11#11..T]]1BTEXEX1XXii++==''!+
 004==@4>>Qyy''
 ||EJJt~~$>? LLDNNQ./\\%))A,/
&""t/E/E/V\`
	 ejj89		$"8"8$:J:JQWQgQgh )9&}6LdS"=2DdK %[1	!8$^"
 %<$W%
! ,C$^,
(
 "%&)0 $"
 &> &r<   rJ   cache_paramsr   c                    |j                   \  }}}| j                  | j                  z  }d| j                  z  d| j                  z  | j                  z  z   | j                  z   }|W|j                  | j                        r;| j                  |j                  d            }	|	j                   d   |z
  dz  }
|
|
| j                  | j                  | j                  g}t        j                  |	|d      \  }}}}}t        ||j                  | j                     j                  | j                  j                   j                  d      | j                  j"                  | j$                        }t        j                  || j                  ||gd      \  }}}t        j&                  | j(                  j+                                }|d d d df   d d d d d f   j-                  d| j.                  | j                        j1                  t        j2                        }|d d d d d f   j-                  dd| j.                        }| j4                  d d d df   j-                  d| j.                        }| j6                  d d d df   j-                  d| j.                        }|j9                  || j                  |j                   d   | j                  z        }|j9                  || j                  |j                   d   | j                  z        }|j9                  || j                  | j.                        }t;        |j                  | j                     j<                  ||||||d |d
      }|j9                  || j                  | j.                  z        }| j?                  ||      }| jA                  |      d d d df   }|S |Bt        jB                  |dk(        s*|jD                  }||d d d d d f   z  j1                  |      }| j                  |      }t        j&                  | j(                  j+                                }| jF                  i nd	| jF                  i}|t        jB                  |dk(        }nd}| jH                  r| jJ                  r||rtM        || j                  j                   j                  d      | j                  j"                  | j4                  |f| j6                  | jN                  d | j$                  | j>                  j                   | j>                  jP                  | j@                  j                   | j@                  j"                  | j.                  | j                  d
dd|\  }}|S t        j                  || j                  | j                  | j                  gd      \  }}}|j|jS                  dd      }tT        jV                  jY                  || jZ                  |j                   d   z
  df      }|j]                  || j                        }t^        | j$                  dvrJ| ja                  | j                  |jS                  dd            jS                  dd      d d d |f         }nyt_        |jS                  dd      | j                  j                   j                  d      | j                  j"                  | j$                        jS                  dd      d d d |f   }t        j                  || j                  ||gd      \  }}}|Bt        jB                  |dk(        s*|jD                  }||d d d d d f   z  j1                  |      }tc        |j9                  ||d| j.                        |||j9                  ||| j                  d      |j9                  ||| j                  d      f| jN                  | j6                  d d d| j4                  dd|\  }}|||je                  || j                         |j9                  ||d      }| j?                  ||      }| jA                  |      }|S )Nr   r)   r>   dim.r@   T)zr   dt_softplusdt_limitF)r   r   seq_idxr   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr   )rD   swish)xr4   rg   r   )r   r   r   r   r   r   r   )3rE   r   r   r   r   has_previous_staterb   r   squeezer   r2   splitr   layersconv_statesr   r4   rg   r   expr   floatexpandr   rA   rB   r   r   rF   r   recurrent_statesr   r   r   r@   r   r   r   r   r   r5   r   r   rC   padr   update_conv_stater   r   r   update_recurrent_state)r7   rJ   r   r   
batch_sizeseq_len_groups_time_state_sized_to_removein_projected_statesd_mlpsplit_projection_dimrK   hidden_states_B_CdtBCr   r   r   hidden_states_reshapedoutr@   projected_statesdt_limit_kwargsinput_not_masked	ssm_state	time_stephidden_states_B_C_t
conv_statescan_outputs                                  r;   cuda_kernels_forwardz%Zamba2MambaMixer.cuda_kernels_forward$  s{    "/!4!4
GQ!%1D1D!D$0001t}}3DtGZGZ3ZZ]a]k]kk #(G(G(W"&,,}/D/DQ/G"H(..r2[@QFE$)5$2H2H$--Y]YgYg#h 05<OQekm0n-Aq$)2 4!##DNN3??""**1-  ! #(++!'')?AWX#M1a
 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az4==!''!*2MNAz4==!''!*2MNA%2%7%7
DNNTXTaTa%b"2##DNN3DD& M *..z4>>DMM;YZM IImT:M--.q$|<Cz 
u )%))Na<O2P%++!.1d
1K!K O OPU V#||M:4::++-..A$($8$8$@bzSWSgSgFhO)#(99^q-@#A #' $$<;OTd!A$KK&&..q1KK$$LL" ff# ##'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(,#"$ &%"YX 
m 6;[[$++T]]DNNK62'  +*;*E*Ea*K'!#!2!2+d.C.CFYF_F_`bFc.cef-g"J ".!?!?
DNN![J#+tFW/W(,$5$?$?1$EFPPQRTUVWXZb[bZbWbc)% )9+55a;#{{1199!<![[--#'??	)
  i1oa'k)3% ',kk%++-CE[\'#q!
 "-eiiRS@S6T)//E%2^Aq$J5O%O$S$STY$ZM)B!&&z7BNFF:wrBFF:wrB*  $ff (, LL $* &*&Y (\-E 77	4>>R)..z7BG"iiT:mmK0
r<   c                 d   |j                   \  }}}|j                  }|-|j                  | j                        r| j	                  |      }n1|||d d d d d f   z  j                  |      }| j	                  |      }|j                   d   d| j                  z  z
  d| j                  z  | j                  z  z
  | j                  z
  dz  }	|j                  |	|	| j                  | j                  | j                  gd      \  }}}
}}|j                  dd      }|d uxr |j                  | j                        }|r|j                  || j                        }t        j                  || j                   j"                  d d dd d f   z  d      }| j$                  r|| j                   j&                  z  }| j)                  |      j                  |      d d d df   }n|Xt*        j,                  j/                  || j0                  |j                   d   z
  df      }|j                  || j                        }| j)                  | j!                  |      dd |f   j                  dd            }|*|j                  }||d d d d d f   z  j                  |      }t        j                  || j                  | j                  | j                  z  | j                  | j                  z  gd      \  }}}t        j2                  | j4                  j7                                }|r`|j8                  dk(  r
|d d d df   n|d d dd d f   d d d df   }|j                  dd      j;                  ||j                   d   | j<                        }| j>                  d   j;                  | j>                  j                   d   | j<                        }t        j*                  j,                  jA                  ||j                  |j                        z         }t        jB                  || jD                        }|d   j;                  | j                  | j<                  | j                        j                  t        jF                  	      }t        j2                  |d   |z        }|jI                  || j                  d      dd d d f   }|j;                  || j                  | j                  | j                  z  |j                   d         jK                         }|jI                  |d|j                   d         }|d   |dd d d f   z  }|jI                  |d| j<                        }||d   z  }|jL                  | j                     jN                  jQ                         }||z  |z   }|jS                  || j                        }|jI                  || j                  d      dd d d f   }|j;                  || j                  | j                  | j                  z  |j                   d         jK                         }|jI                  |d|j                   d         }|j                  |j                        }|jU                  || j                  z  | j<                  | j                        }|jU                  || j                  z  | j                  d      }t        jV                  ||      }|jU                  || j                  | j<                        }| jX                  d   j;                  | jX                  j                   d   | j<                        }|||z  z   j                  |j                        }|jI                  |d      d d d df   }nt*        j,                  jA                  || j>                  z         }t        jB                  || jD                        }|jI                  ||d| j<                        j7                         }|jI                  ||d| j                        j7                         }|jI                  ||d| j                        j7                         }|j[                  | j                  | j                  z  d| j                  
      }|j[                  | j                  | j                  z  d| j                  
      }| j\                  || j\                  z  z
  | j\                  z  }| jX                  d   t_        ||      z  }||d   z  }|j                  |j                        |z  }||||fD cg c]  }ta        ||| j\                         c}\  }}}}|jc                  dddd      }t        jd                  |d      }t        j2                  tg        |            }|d d d d d d d d d d d f   |d d d d d d d d d d d f   z  } | j                  d      }!|!d   |jc                  ddddd      d   z  }"|"j                  d      }#|#d   |d d d d d f   z  j                  d      }$t        j2                  |d d d d d d dd f   |z
        }%||%jc                  dddd      d   z  }&|&jc                  ddddd      d   |jc                  ddddd      dd d d f   z  j                  d      jc                  ddddd      }'t        jh                  |'d d d df         }(t        jj                  |(|'gd      }'t        j2                  tg        t*        j,                  j/                  |d d d d d d df   d                  })|'jc                  ddddd      }*|)d   |*d d d d d df   z  j                  d      }+|+jc                  ddddd      },|,d d d df   |,d d df   }-}'t        j2                  |      }.|dd d d f   |'d d d d d df   z  }/|.jc                  dddd      }0|/j                  d      |0d   z  }1|$|1z   }|jI                  |d| j                  | j<                        }||z   }|dkD  r|d d d |d d d d f   }|jI                  ||d      }|-||jS                  |-| j                         | jm                  ||
      }2| jo                  |2j                  |            }3|3S c c}w )Nr>   r   r   r)   r   .).N).NNr   )r   output_sizer      )r)   r   )8rE   r@   r   rb   r   rA   r   r   r   r   r   r   r   r   r2   sumr   r4   r   rg   r   r   rC   r   r   r   r   r   ndimr   r   r   softplusclampr   rB   r   r   r   r   cloner   rF   bmmr   repeat_interleaver   r   r   permutecumsumr   
zeros_likecatr   r   )4r7   input_statesr   r   r   r   r   r@   r	  r  rK   rJ   r  use_precomputed_stater  r  r  r   r   dAdBdBx
ssm_statesssm_states_reshaped
C_reshapedyr   pad_size
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decay_contractionstatesprevious_statesdecay_chunkstates_permutedresult
new_statesr  state_decay_outC_times_statesstate_decay_out_permutedY_offr  contextualized_statess4                                                       r;   torch_forwardzZamba2MambaMixer.torch_forward  s   !-!3!3
GQ""#(G(G(W#||L9) ,~aDj/I IMMeT#||L9!''+a$2H2H.HH1t}}K\_c_r_rKrrtx  uC  uC  C  HI  I(8(>(>t55t~~V\^ )? )
%1dM2 &//15 ,D 8 l\=\=\]a]k]k=l !%77t~~VJ!IIj4;;3E3EaAg3N&NTVWM!!!1!11 HH]366u=aslKM']]..!**]-@-@-DDaH
 *;;JW
 HHT[[%?XgX%N%X%XYZ\]%^_M)%++!.1d
1K!K O OPU V#kk-$:P:PRVR_R_bfbubuRuw{  xE  xE  HL  H[  H[  x[  :\  bd  eq!YYtzz'')**  &(WW\AtSL!r!Q'{1dC<7PBa#**:rxx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!3!34B/"))$..$--I\I\]``glgtgt`uA2i=1,-B
 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PM}Y//C &,,T^^<MMSSUJ#b3.J%<<ZXJ 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A $qww/J",//*t~~2Mt}}^b^q^q"r
T^^ ;T=P=PRSTJ		-z:Az4>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''T\\(9:BR!3!34B)11*gr4==Y__aM		*gD4G4GHNNPA		*gr43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'DOO*CCtVH	*-?x-XXJ *ByM9M](()B.A cpqrtuwxay%z\]&9!Xt&W%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCAFF !99XaArsl%;h%FGL"#l&:&:1aA&Fy&Q"Q)11!Q1a@K}OdOdefhiklnoqrOstwy}  @A  uA  PB  B  G  G  LM  G  N  V  V  WX  Z[  ]^  `a  cd  eF#..va!e}=OYY8a@F))K0A0A(1aQRTV;BWY_0`$abK$nnQ1a;O!/2_Q4QT_5UUZZ_`ZaF1aA6J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*gr2A$)A33It~~Nii4(
 !%knnU.C D$$C &{s   .r-c                     t         rId| j                  j                  j                  j                  v rt               s| j                  |||      S | j                  |||      S )Ncuda)r   r   r4   devicetyper   r  r@  )r7   rJ   r   r   r   s        r;   rR   zZamba2MambaMixer.forwardq  sT     "f0C0C0J0J0O0O&OXpXr,,]L.YY!!-~NNr<   r.   NN)rS   rT   rU   r   r*   r   r0   r2   r   r	   r  r@  rR   rV   rW   s   @r;   r   r      s    Y| Yd
 Y| &*.2	T||T dlT t+	Tns% s%[`[g[gjn[n s%r &*.2	
O dl
O t+	
Or<   r   c                   8     e Zd Zddededz  f fdZddZ xZS )	Zamba2MLPNra   rd   c           	          t         	|           || _        |j                  | _        |j                  | _        || _        || _        t        j                  | j                  d| j                  z  |j                        | _
        t        j                  | j                  | j                  |j                        | _        t        |j                     | _        t        j                  g       | _        t#        | j
                        D ]  }||j$                  z  |k(  rt        j&                  t        j                  | j                  j                  | j                  j(                  d      t        j                  | j                  j(                  d| j                  z  d            }nt        j*                         }| j                   j-                  |        |j.                  }t1        |      D ci c]  \  }}||
 c}}| _        yc c}}w )aQ  
        This MLP layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this layer
        is tied, un-tied adapter modules (formally same as LoRA, but used in the base model) are added to the up and gate projectors to increase expressivity with a small memory overhead.
        r   rf   FN)r/   r0   ra   r8   r   rc   rd   r   rr   r   gate_up_proj	down_projr   
hidden_actact_fnrk   gate_up_proj_adapter_listro   rp   rq   rt   ru   rv   rh   rw   rx   )
r7   ra   rc   rd   ry   gate_up_proj_adapterri   r}   r~   r:   s
            r;   r0   zZamba2MLP.__init__  s   
 	!--!'!9!9"4 IId&6&6D<R<R8RY_YoYop4#9#94;K;KRXRhRhiV../)+r):&t../ 	HA6(((H4')}}IIdkk55t{{7O7OV[\IIdkk66D<R<R8RY^_($
 (*{{}$**112FG	H !11;D_;UV<5%%,VVs   3H
c                     | j                  |      }| j                  |   }| | j                  |   |      z   }t        j                  |dd      }| j                  |d         |d   z  }| j                  |      }|S )Nr   r>   r   r   r)   )rI  rx   rM  r2   chunkrL  rJ  )r7   hidden_staterb   gate_up_stateoutputs        r;   rR   zZamba2MLP.forward  s    )),7NN9-	%(Q(F(Fy(QR^(__M1"={{=#34}Q7GG-r<   rE  r.   )rS   rT   rU   r*   r   r0   rR   rV   rW   s   @r;   rG  rG  ~  s%    W| WPSVZPZ W<r<   rG  c                        e Zd Zddededz  dedz  f fdZ	 	 	 ddej                  dej                  dedej                  dz  d	edz  d
ej                  dz  de
e   deej                     fdZ xZS )Zamba2AttentionDecoderLayerNra   rd   rb   c                     || _         t        |j                        }t        |   ||       t        |d||      | _        t        |||      | _        y )Nr>   )rb   rc   rd   )rc   rd   )	rd   lenrh   r/   r0   r`   	self_attnrG  feed_forward)r7   ra   rd   rb   num_gsr:   s        r;   r0   z$Zamba2AttentionDecoderLayer.__init__  sO     V,,-+(2RXckl%fRZ[r<   rJ   original_hidden_statesr   r   r   r   r   c           	          t        j                  ||gd      }| j                  |      } | j                  d|||||d|\  }}| j	                  |      }| j                  ||      }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
                concatenated tensor is then used as input of the pre-attention RMSNorm
                (see fig. 2 in https://huggingface.co/papers/2405.16712).
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        r>   r   )rJ   rb   r   r   r   r[   )r2   concatenateinput_layernormrX  pre_ff_layernormrY  )	r7   rJ   r[  rb   r   r   r   r   r   s	            r;   rR   z#Zamba2AttentionDecoderLayer.forward  s    6 ))=:P*QWYZ,,];)4>> 
')+ 3
 
q --m<))-Cr<   rE  r   )rS   rT   rU   r*   r   r0   r2   r   r	   
LongTensorr   r   r   FloatTensorrR   rV   rW   s   @r;   rU  rU    s    \| \sTz \UX[_U_ \ /3(,7;)||) !&) 	)
 t+) ) #--4) +,) 
u  	!)r<   rU  c                   (     e Zd Zdedef fdZ xZS )Zamba2MambaDecoderLayerra   rb   c                     t         |   ||       t        ||      | _        t	        |j
                  |j                        | _        y )N)ra   rb   r9   )r/   r0   r   mambarY   r8   rms_norm_epsr^  )r7   ra   rb   r:   s      r;   r0   z Zamba2MambaDecoderLayer.__init__  s;    +%VyI
,V-?-?VEXEXYr<   )rS   rT   rU   r*   r   r0   rV   rW   s   @r;   rc  rc    s    Z| Z Z Zr<   rc  c                       e Zd Zdedej
                  def fdZ	 	 	 	 	 	 	 	 ddej                  dej                  dz  de
dz  d	ej                  dz  d
ej                  dz  dedz  dedz  dej                  dz  dej                  dz  dee   deej"                  eej"                  ej"                  f   dz  f   fdZ xZS )Zamba2HybridLayershared_transformerlinearrf  c                 :    t         |   |||       | `|| _        y r.   )r/   r0   shared_transfrj  )r7   rj  rk  rf  r:   s       r;   r0   zZamba2HybridLayer.__init__  s%     	+VU;"4r<   NrJ   r[  rb   r   causal_maskr   	use_cacher   position_idsr   r   c
           
           | j                   |f||||||	d|
}| j                  |      } | j                  |f|||||d|
}|S )ap  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
            hidden activations to form the input of the shared transformer layer.
            layer_idx (`int`): layer number.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        )r[  rb   r   r   r   rp  )transformer_hidden_statesr   r   ro  r   )rj  rk  mamba_decoder)r7   rJ   r[  rb   r   rn  r   ro  r   rp  r   rr  s               r;   rR   zZamba2HybridLayer.forward  s    < %<D$;$;	%
#9&+ 3%	%
 	%
! %)KK0I$J!***
&?)+ 3
 
 r<   )NNNNNFNN)rS   rT   rU   rU  r   rr   rc  r0   r2   r   r   r	   boolr`  r   r   r   ra  rR   rV   rW   s   @r;   ri  ri    s"   5"=5GIyy5Yp5 7; $.2+/(,!&7;044||4 !&t 34 :	4
 t+4 \\D(4 4 $;4 #--44 &&-4 +,4 
u  %(9(95;L;L(L"MPT"TT	U4r<   ri  c                        e Zd ZU eed<   dZdZddgZdZdZ	dZ
dZdZeedZ ej"                          fd       Z xZS )	Zamba2PreTrainedModelra   modelTri  rc  r   )rJ   
attentionsc                    t         |   |       t        |t              rt	        j
                  t	        j                  | j                  j                        t        j                  | j                  j                        t        j                  | j                  j                        z
  z  t        j                  | j                  j                        z         j                  | j                  j                        }|t	        j                  t	        j                  |              z   }t!        j"                  |j$                  |       t	        j&                  d|j(                  dz         }t!        j"                  |j*                  t	        j                  |             t!        j,                  |j.                         y y )N)minr)   )r/   _init_weights
isinstancer   r2   r   randra   r   mathr   r   r   r  time_step_floorexpm1initcopy_r   r   r   r   ones_r   )r7   moduler  inv_dtr   r:   s        r;   r{  z#Zamba2PreTrainedModel._init_weights2  s(   f%f./

4;;44588DKK556$++B[B[9\\^((4;;4456 e33e4	  %))U[["%5$566FJJv~~v.Q 0 01 45AJJv||UYYq\2JJvxx  0r<   )rS   rT   rU   r*   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_flex_attn_supports_sdpa_is_statefulrc  r`   _can_record_outputsr2   no_gradr{  rV   rW   s   @r;   rv  rv  "  sg    &*#,.GH"3NL0%
 U]]_! !r<   rv  c                       e Zd ZdZdefdZd Zeee		 	 	 	 	 	 dde
j                  dz  de
j                  dz  de
j                  dz  d	edz  d
e
j                  dz  dedz  dee   deez  fd                     Zy)Zamba2Modelzh
    Model consisting of *config.num_hidden_layers* layers.

    Args:
        config: Zamba2Config
    ra   c                 X   t         j                  | |       || _        |j                  | _        |j
                  | _        t        j                  |j
                  |j                  | j                        | _	        |j                  | _
        | j                         | _        |j                  | _        t        |j                  |j                        | _        |j"                  r1|j$                  rt&        j)                  d       t+        |      | _        d| _        | j1                          y )Nre  ze`use_long_context` set to `True`: using rescaled `rope_theta` and extended `max_position_embeddings`.F)rv  r0   ra   pad_token_idpadding_idx
vocab_sizer   	Embeddingr8   embed_tokenslayers_block_type
get_layersr   r   rY   rg  final_layernormr   use_long_contextr   r   r^   
rotary_embgradient_checkpointing	post_init)r7   ra   s     r;   r0   zZamba2Model.__init__L  s    &&tV4!.. ++LL):):F<N<NPTP`P`a!'!9!9oo'$*$?$?!,V-?-?VEXEXY&&##{ 4F;DO&+# 	r<   c                     g }i | _         d| _        g }t        | j                        D ]O  \  }}t	        | j
                  |      }|dk(  rd| d}t        |t              r"t        |      | j
                  j                  k\  rDt        |t              rt        |      }t        |      }| j                   j                  ||i       n|j                  |       || j
                  j                  z  }t        | j
                  |      }	t        j                   | j
                  j"                  | j
                  j"                  d      }
|j                  t%        |	|
|             ?|j                  |       R t        j&                  |      S )	Nr   )rb   hybridzlayers.z.shared_transformer)rd   Frf   )_tied_weights_keysfirst_transformer_layer_idrw   r  rc  ra   r|  listrW  rp   r   nextr   rv   rU  r   rr   r8   ri  rk   )r7   r   unique_hybrid_blockslayer_id
layer_typemamba_layerprefix_patterntarget_patternrd   
attn_blocklinear_layers              r;   r  zZamba2Model.get_layersc  sQ   "$*+'!$-d.D.D$E 	+ Hj1$++RKX%#*8*4G!H ##7>/0DKK4N4NN!"6=/45I/J,%)*>%?N++22NN3ST )//?#dkk&@&@@8xX
!yy)@)@$++BYBY`ef/
L+VWk*3	+4 }}V$$r<   N	input_idsr   rp  r   inputs_embedsro  r   r   c           	         |d u |d uz  rt        d      || j                  |      }|}t        j                  |      }	|r|t	        | j
                        }|V||j                         nd}
t        j                  |j                  d   |j                        |
z   }|j                  d      }t        | j
                  ||||      }| j
                  j                  r| j                  ||      }nd }t        | j                        D ]  \  }} |||	|||f||||d|} | j!                  |      }t#        ||r|	      S d 	      S )
NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either one)ra   r   r)   rC  )ra   r  r   r   rp  )rp  )r   ro  r   rp  )last_hidden_stater   )
ValueErrorr  r2   r  r
   ra   get_seq_lengthr   rE   rC  	unsqueezer   r   r  rw   r   r  r   )r7   r  r   rp  r   r  ro  r   rJ   r[  past_seen_tokensrn  r   rb   layers                  r;   rR   zZamba2Model.forward  s    -t";<s    --i8M%!&]!; 0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 ;;##"&//-l/"["& )$++ 6 	Iu!& !0#$7) M	 ,,];&+/8O
 	
>B
 	
r<   )NNNNNN)rS   rT   rU   r   r*   r0   r  r   r   r   r2   r`  r   r	   ra  rt  r   r   r   r   rR   r[   r<   r;   r  r  D  s    | . %D   .2.204(,26!%@
##d*@
 t+@
 &&-	@

 @
 ((4/@
 $;@
 +,@
 
(	(@
    @
r<   r  c                   $     e Zd Zdef fdZ xZS )Zamba2ForCausalLMra   c                 d    t         |   |       t        |      | _        | j	                          y r.   r/   r0   r  rw  r  r7   ra   r:   s     r;   r0   zZamba2ForCausalLM.__init__  &      (
r<   )rS   rT   rU   r*   r0   rV   rW   s   @r;   r  r    s    |  r<   r  c                   .    e Zd Zdef fdZee	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de
dz  dej                  dz  d	ej                  dz  d
edz  deej                  z  dee   deez  fd              Z xZS )Zamba2ForSequenceClassificationra   c                 d    t         |   |       t        |      | _        | j	                          y r.   r  r  s     r;   r0   z(Zamba2ForSequenceClassification.__init__  r  r<   Nr  r   rp  r   r  labelsro  logits_to_keepr   r   c	           	          | j                   |f|||||d|	}
|
d   }| j                  |      }||j                  d   }n|j                  d   }| j                  j                  |dk7  rt        d      | j                  j                  d}n||| j                  j                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                  j                   d       |t        j                  ||j                  	      |f   }d}|! | j                   d|||| j                  d
|	}t#        |||
j$                  |
j&                  |
j(                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        )r   rp  r   r  ro  r   Nr)   z=Cannot handle batch sizes > 1 if no padding token is defined.r>   )rC  r@   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r  )logitsr  pooled_logitsra   )lossr  r   rJ   rx  r[   )rw  scorerE   ra   r  r  rA   rC  r2   int32r   argmaxr   r   r:   rS   loss_functionr   r   rJ   rx  )r7   r  r   rp  r   r  r  ro  r  r   transformer_outputsrJ   r  r   last_non_pad_tokennon_pad_masktoken_indicesr  r  s                      r;   rR   z'Zamba2ForSequenceClassification.forward  s   ( 8Btzz8
)%+'8
 8
 ,A.M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaab%4%% $V=Y]YdYdhnD 0 /??-;;*55
 	
r<   )NNNNNNNr   )rS   rT   rU   r*   r0   r   r   r2   r`  r   r	   ra  rt  r   r   r   r   r   rR   rV   rW   s   @r;   r  r    s   | 
  .2.204(,26*.!%-.@
##d*@
 t+@
 &&-	@

 @
 ((4/@
   4'@
 $;@
 ell*@
 +,@
 
1	1@
  @
r<   r  )r  r  r  rv  )Mr~  collections.abcr   	itertoolsr   r2   r    r   r  activationsr   cache_utilsr	   r
   integrations.hub_kernelsr   masking_utilsr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   utils.import_utilsr   utils.output_capturingr   llama.modeling_llamar   r   mamba2.modeling_mamba2r   r   r   zamba.modeling_zambar    r!   r"   r#   r$   r%   r&   r'   r(   configuration_zamba2r*   _CONFIG_FOR_DOC
get_loggerrS   r   Moduler,   rY   r^   r`   r   rG  rU  rc  ri  rv  r  r  r  __all__r[   r<   r;   <module>r     sR    $    & ! . 8 / Y F & l l 7 9 5 M Y Y
 
 
 / '			H	%; ;*	L 		0 	j)n j)ZzOryy zOz'		 'T1"< 1hZ4 Z<( <~ !O ! !BD
*3 D
N( H
&D H
Vr<   