
    i\                        d dl mZ d dlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZ  e	       rd dlZ ej                  e      Z	 	 d,d	ej                   d
ej                   dej                   dz  dedej                   f
dZdej&                  j(                  dej                   dej                   dej                   dej                   f
dZd	ej                   d
ej                   dej                   dej                   fdZd	ej                   d
ej                   dej                   dej                   fdZd Zd Z e	       rXej4                  j7                  ded       ej4                  j9                  de       ej4                  j;                  dee       d	ej                   d
ej                   dej                   defdZd	ej                   d
ej                   dej                   dej                   fdZ	 	 d,d	ej                   d
ej                   dej                   dej                   dz  dedej                   fdZ dej&                  j(                  dej                   dej                   dej                   dej                   f
d Z! G d! d"e      Z" e"       Z#d#ej                   dej                   fd$Z$	 d-e#ddd%d&d'e%ej&                  j(                     dz  d(e"ded)ed*ede%ej&                  j(                     fd+Z&y).    )Callable)wraps   )logging)GeneralInterface)is_torch_availableis_torch_less_or_equalis_torchdynamo_compilingNFinputweightbiasis_transposedreturnc                     |r5t        j                  | j                  d      |      j                  d      }n4t        j                  || j                  d            j                  d      }|||z   }|S )a  Batched linear layer supporting optional bias and transposed weights.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (batch_size, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (batch_size, output_dim, input_dim) if transposed is `False`,
            else of shape (batch_size, input_dim, output_dim).
        bias (`torch.Tensor`, *optional*):
            Bias tensor of shape (batch_size, output_dim). Default is `None`.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the weight tensor is transposed.
    Returns:
        `torch.Tensor`: Output tensor of shape (batch_size, output_dim).
       )torchbmm	unsqueezesqueeze)r   r   r   r   outs        n/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/integrations/moe.py_batched_linearr   E   se    * ii*F3;;A> ii 34<<R@DjJ    selfhidden_statestop_k_indextop_k_weightsc                 ~   |j                   }|j                  d      }|j                  d      }|j                  d      }t        j                  ||      j	                  d      j                  d|      j                  d      }|j                  d      }	|j                  d      }
|
| j                  k\  }|
j                  d| j                  dz
        }
||   }| j                  r-| j                  |
   }| j                  r| j                  |
   nd }n,| j                  |
   }| j                  r| j                  |
   nd }t        |||| j                         }| j                  r| j#                  |      }n| j%                  |      }| j&                  |
   }| j                  r| j(                  |
   nd }t        |||| j                         }||	j	                  d      z  }|j+                  |j	                  d      d       |j-                  |||      j/                  d      }|j1                  |j2                        S )Nr   r   devicer   r   r   g        dim)r!   sizer   aranger   expandreshapenum_expertsclamphas_gategate_up_projhas_biasgate_up_proj_biasup_projup_proj_biasr   r   _apply_gateact_fn	down_projdown_proj_biasmasked_fill_viewsumtodtype)r   r   r   r   r!   	num_top_k
num_tokens
hidden_dim	token_idxsample_weights
expert_idsinvalid_maskselected_hidden_statesselected_weightsselected_biasesproj_outweighted_outfinal_hidden_statess                     r   batched_mm_experts_forwardrG   g   s&    !!F  $I##A&J##B'J Z7AA!DKKBPYZbbcefI"**2.N$$R(J !1!11L!!!T%5%5%9:J +95 }},,Z8@D$00<SW<<
3;?==$++J7d  0VZVhVhH
 }}##H- ;;x( ~~j19=d))*5DO "HZHZH
 n66r::Ll44R8#> '++J	:NRRWXRY!!-"5"566r   offsc                 4   t        j                  | j                  d      |j                  d      | j                  | j                        }d}t        |j                               D ].  \  }}||k(  rt        j                  | || ||   |||        |}0 |S )a(  
    Fallback grouped matrix multiplication used when `torch.nn.functional.grouped_mm` and `torch._grouped_mm`
    are unavailable or incompatible with `torch.compile` (e.g. non-bfloat16 weights).

    Args:
        input (`torch.Tensor`): Input of shape (S, input_dim), sorted by expert id.
        weight (`torch.Tensor`): Expert weights of shape (num_experts, input_dim, output_dim).
        offs (`torch.Tensor`): Cumulative token counts per expert of shape (num_experts,).
    Returns:
        `torch.Tensor`: Output of shape (S, output_dim).
    r   r   r!   r9   r   )r   zerosr%   r!   r9   	enumeratetolistmm)r   r   rH   outputstartiends          r   _grouped_mm_fallbackrT      s     [[AAu||SXS^S^_FE DKKM* 3C<uS!6!9&s2CD	 Mr   c                 p   | j                         dk(  sJ dt        | j                                |j                         dk(  sJ dt        |j                                |j                         dk(  sJ dt        |j                                |j                  d      |j                  d      k(  s+J d|j                  d       d	|j                  d              | j                  d      |j                  d      k(  s+J d
| j                  d       d|j                  d              |j                  t
        j                  t
        j                  fv sJ d|j                          t        j                  | j                  d      |j                  d      | j                  | j                        S )zRShape/dtype inference stub for `_grouped_mm_fallback` required by `torch.compile`.r   z+input must be 2D (S, input_dim), got shape    zBweight must be 3D (num_experts, input_dim, output_dim), got shape r   z*offs must be 1D (num_experts,), got shape r   zoffs length z must match number of experts zinput_dim mismatch: input has z, weight has z$offs must be an integer tensor, got rJ   )
r$   tupleshaper%   r9   r   int32int64emptyr!   r   r   rH   s      r   _grouped_mm_fallback_faker]      s   99;!_J5QVQ\Q\K]J^__::<1 
LUSYS_S_M`Lab 88:?\HtzzIZH[\\?99Q<6;;q>)v\$))A,Geflfqfqrsfteu+vv)::a=FKKN* 
(A}V[[QR^DTU* ::%++u{{33h7[\`\f\f[g5hh3;;uzz!}fkk!nU\\QVQ\Q\]]r   c                 H    | j                  |d   |d          |d   | _        y)zjSaves input and weight for backward; offs is stored directly as it is a non-differentiable integer tensor.r   r   r   N)save_for_backwardrH   )ctxinputsrP   s      r   "_grouped_mm_fallback_setup_contextrb      s%    &)VAY/ayCHr   c                    | j                   \  }}t        j                  |      }t        j                  |      }d}t        | j                  j                               D ]c  \  }}||k(  rt        j                  ||| ||   j                  |||        t        j                  ||| j                  ||| ||          |}e ||dfS )zuBackward pass for `_grouped_mm_fallback`. Computes grad_input and grad_weight per expert group; offs has no gradient.r   rK   N)saved_tensorsr   
zeros_likerM   rH   rN   rO   T)	r`   grad_outputr   r   
grad_inputgrad_weightrQ   rR   rS   s	            r   _grouped_mm_fallback_backwardrj      s    %%ME6!!%(J""6*KE CHHOO-. 3C<U3'*U3:OPuS!##[s%;QP {D((r   z!transformers::grouped_mm_fallback )mutates_args)setup_contextc                 b   t               r|j                  t        j                  k7  sR|j                  j
                  dk(  r:t        dd      r-|j                         dz  dk7  s| j                         dz  dk7  ryt        t        j                  j                  d      xs t        t        d	      S )
a  
    Check if torch.nn.functional.grouped_mm or torch._grouped_mm can be used based on availability and compatibility with torch.compile.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (S, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (num_experts, input_dim, output_dim).
        offs (`torch.Tensor`):
            Offsets tensor indicating the boundaries of each group in the input tensor.
    Returns:
        `bool`: True if grouped_mm can be used, False otherwise.
    cpuz2.10.0T)
accept_dev   r   F
grouped_mm_grouped_mm)r
   r9   r   bfloat16r!   typer	   data_ptrhasattrnn
functionalr\   s      r   _can_use_grouped_mmrz      s     	!"v||u~~'Ee#"8=__#q(ENN,<r,AQ,F 588&&5V9VVr   c                    t        | ||      rt        t        j                  j                  d      rEt        j                  j                  j                  | j                  |j                        ||      S t        t        d      r1t        j                  | j                  |j                        ||      S t        j                  j                  j                  | ||      S )a  Grouped matrix multiplication dispatcher that uses torch.nn.functional.grouped_mm if available, else falls back to torch._grouped_mm.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (S, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (num_experts, input_dim, output_dim).
        offs (`torch.Tensor`):
            Offsets tensor indicating the boundaries of each group in the input tensor.
    Returns:
        `torch.Tensor`: Output tensor of shape (S, output_dim).
    rr   rH   rs   )rz   rw   r   rx   ry   rr   r8   r9   rs   opstransformersgrouped_mm_fallbackr\   s      r   rs   rs     s    $ 5&$/
 588&&588&&11%((6<<2H&W[1\\UM*$$UXXfll%;V$OO99!!55eV$5OOr   c                 r    |rt        | ||      }nt        | |j                  dd      |      }|||z   }|S )a  Grouped linear layer supporting optional bias and transposed weights.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (S, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (num_experts, input_dim, output_dim) if `is_transposed`,
            else of shape (num_experts, output_dim, input_dim).
        offs (`torch.Tensor`):
            Offsets tensor indicating the boundaries of each group in the input tensor.
        bias (`torch.Tensor`, *optional*):
            Bias tensor of shape (num_experts, output_dim). Default is `None`.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the weight tensor is transposed.
    Returns:
        `torch.Tensor`: Output tensor of shape (S, output_dim).
    r|   r   )rs   	transpose)r   r   rH   r   r   r   s         r   _grouped_linearr   8  sF    0 %d3 %!1!1"b!9EDjJr   c                    |j                   }|j                  d      }|j                  d      }|j                  d      }t        j                  ||      j	                  d      j                  d|      j                  d      }|j                  d      }	|j                  d      }
t        j                  |
      }t        j                  |      }t        j                  |j                  d      |      ||<   |
|   }|	|   }|||      }|j                  dk(  r|j                         n|j                         }t        j                  || j                  d| j                  dz
        }t        j                  |dt        j                        }| j                   r*| j"                  }| j$                  r| j&                  |   nd }n)| j(                  }| j$                  r| j*                  |   nd }t-        ||||| j.                        }| j                   r| j1                  |      }n| j3                  |      }| j4                  }| j$                  r| j6                  |   nd }t-        ||||| j.                        }||j	                  d      z  }||   }|j9                  |||      j;                  d	      }|j=                  |j>                        S )
Nr   r   r    r   ro   )binsminmax)r$   r9   r"   r#   ) r!   r%   r   r&   r   r'   r(   argsort
empty_likeru   floatinthistcr)   cumsumrY   r+   r,   r-   r.   r/   r0   r   r   r1   r2   r3   r4   r6   r7   r8   r9   )r   r   r   r   r!   r:   r;   r<   r=   r>   r?   perminv_permexpert_ids_gsample_weights_gselected_hidden_states_ghistc_inputtokens_per_expertoffsetsrB   rC   rD   rE   rF   s                           r   grouped_mm_experts_forwardr   ^  s    !!F  $I##A&J##B'J Z7AA!DKKBPYZbbcefI"**2.N$$R(J ==$D%H\\$))A,v>HTNd#L%d+,Yt_=
 +1++*>,$$&LDTDTDVKKd6F6FASWScScfgSghll,!5;;GG }},,BF--$00>UY<<=A]]$++L9PT  "2G/aeasasH
 }}##H- ;;x( ~~;?==d)),7dO "G/QUQcQcH
 .88<<L  )L '++J	:NRRWXRY!!-"5"566r   c                   :     e Zd ZdZeedZdededef fdZ	 xZ
S )ExpertsInterfacez;Interface for registering custom experts forward functions.)
batched_mmrr   experts_implementationdefaultr   c                     |t         j                  d       n|dk7  r|| vrt        d| d      t        |   ||      S )zfReturn the requested `experts_implementation`. Also strictly check its validity, and raise if invalid.a
  You tried to access the `ExpertsInterface` with a `config._experts_implementation` set to `None`. This is expected if you use an Expert Module as a standalone Module. If this is not the case, something went wrong with the dispatch of `config._experts_implementation`eager`zL` is not a valid experts implementation registered in the `ExpertsInterface`)loggerwarning_onceKeyErrorsuperget)r   r   r   	__class__s      r   get_interfacezExpertsInterface.get_interface  s`    !)N
 $w.3IQU3U*++wx  w{17;;r   )__name__
__module____qualname____doc__rG   r   _global_mappingstrr   r   __classcell__)r   s   @r   r   r     s4    E 10O
<C <( <x < <r   r   gate_up_outc                 V    |j                  dd      \  }}| j                  |      |z  S )a  
    Default gating mechanism: splits the gate_up_out into gate and up parts,
    applies the activation function to the gate part, and multiplies it with the up part.
    Args:
        gate_up_out (`torch.Tensor`):
            The output tensor from the gate and up projection of shape (S, 2 * intermediate_dim).
    Returns:
        `torch.Tensor`: The gated output tensor of shape (S, intermediate_dim).
    r   r   r#   )chunkr2   )r   r   gateups       r   _default_apply_gater     s1        +HD";;tr!!r   T)experts_interfacer   r-   r+   experts_classr   r-   r+   c                    dt         t        j                  j                     dt         t        j                  j                     ffd}|  ||       S |S )a  Decorator to modify experts class to support different experts implementations.

    Args:
        experts_class (`type[torch.nn.Module]`, *optional*):
            The experts class to modify. If not provided, returns a decorator that can be applied to the class.
        experts_interface (`ExpertsInterface`, *optional*, defaults to `ALL_EXPERTS_FUNCTIONS`):
            The experts interface to use for dispatching the forward method.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the expert weights are stored in transposed format.
        has_bias (`bool`, *optional*, defaults to `False`):
            Whether the expert layers include bias terms.

    Returns:
        `type[torch.nn.Module]`: The modified experts class.
    r   r   c                     | j                   | j                  t              fd       }t              fd       }t        | d      st        | _        || _         || _        | S )Nc                 X     | |g|i | || _         | _        | _        | _        y N)configr+   r-   r   )r   r   argskwargsr-   r+   r   original_inits       r   __init__z=use_experts_implementation.<locals>.wrapper.<locals>.__init__  s4    $888 DK$DM$DM!.Dr   c                 h    j                  | j                  j                        } || g|i |S r   )r   r   _experts_implementation)r   r   r   experts_forwardr   original_forwards       r   forwardz<use_experts_implementation.<locals>.wrapper.<locals>.forward  s5    /==dkk>a>acstO"49$9&99r   r1   )r   r   r   rw   r   r1   )	r   r   r   r   r   r   r-   r+   r   s	      @@r   wrapperz+use_experts_implementation.<locals>.wrapper  su    %..(00	}		/ 
	/ 
	 	: 
!	: }m4(;M%!) 'r   )ru   r   rx   Module)r   r   r   r-   r+   r   s    ```` r   use_experts_implementationr     sM    0tEHHOO4 ehhoo9N  0  }%%Nr   )NFr   )'collections.abcr   	functoolsr   utilsr   utils.genericr   utils.import_utilsr   r	   r
   r   
get_loggerr   r   Tensorboolr   rx   r   rG   rT   r]   rb   rj   library	custom_opregister_fakeregister_autogradrz   rs   r   r   r   ALL_EXPERTS_FUNCTIONSr   ru   r   rk   r   r   <module>r      sz   %   , e e  
		H	%Z !%	<<LL ,,
 	
 \\DA7
((//A7<<A7 A7 <<	A7
 \\A7N ell %,, [`[g[g 4^U\\ ^5<< ^u|| ^`e`l`l ^)& 	MM?AUdfg	MM CE^_	MM##+%8 $ Wu|| WU\\ W WZ^ W>P<<PLLP ,,P \\	PF !%#<<#LL# ,,# ,,
	#
 # \\#LO7
((//O7<<O7 O7 <<	O7
 \\O7d<' <. )* "5<< "ELL " 373 +@3(4/3 (3 	3
 3 3 
%((//3r   