
    iA                        d dl Z d dlmZ d dlZd dlmc mZ d dlmZ ddlm	Z
 ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZ  G d dej:                        Z G d dej:                        Ze G d de             Z dejB                  defdZ"e G d de              Z# G d de       Z$g dZ%y)    N)Callable)nn   )initialization)Cache)create_bidirectional_mask)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)auto_docstringcan_return_tuple)maybe_autocast   )	AutoModel   )	PI0Configc                   4     e Zd Z fdZed        Zd Z xZS )PI0TimestepEmbeddingsc                 z    t         |           || _        | j                  |      }| j	                  d|d       y )Nsinusoid_freqF)
persistent)super__init__configcompute_freqsregister_buffer)selfr   r   	__class__s      u/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/pi0/modeling_pi0.pyr   zPI0TimestepEmbeddings.__init__(   s:    **62_mN    c                    t        j                  dd| j                  j                  dz  t         j                        }| j
                  | j                  | j
                  z  |z  z  }d|z  dz  t        j                  z  }|S )N              ?r   dtype)	torchlinspace
dit_confighidden_sizefloat32
min_period
max_periodmathpi)r   fractionperiodr   s       r   r   z#PI0TimestepEmbeddings.compute_freqs.   sr    >>#sF,=,=,I,IQ,NV[VcVcd""f&7&7&:K:K&KPX%XXfq(4772r    c                    t        |j                  j                  t              r/|j                  j                  dk7  r|j                  j                  nd}t	        |d      5  | j
                  d d d f   }||d d d f   z  }t        j                  |j                         |j                         gd      }d d d        |S # 1 sw Y   S xY w)NmpscpuF)device_typeenabledr   dim)

isinstancedevicetypestrr   r   r&   catsincos)r   timer4   r   embtime_embedss         r   forwardzPI0TimestepEmbeddings.forward5   s    *4T[[5E5Es*KPTP[P[P`P`diPidkk&&otUC 	C ..tQw7M$q$w-/C))SWWY	$:BK	C 		C s   #ACC)__name__
__module____qualname__r   staticmethodr   rB   __classcell__r   s   @r   r   r   '   s"    O  r    r   c                   $     e Zd Z fdZd Z xZS )PI0ActionTimeEmbeddingc                 8   t         |           t        |      | _        t	        j
                  |j                  |j                  j                        | _	        t	        j
                  |j                  |j                  j                        | _        t	        j
                  d|j                  j                  z  |j                  j                        | _        t	        j
                  |j                  j                  |j                  j                        | _        y )Nr   )r   r   r   sinusoid_embedsr   Linearmax_action_dimr(   r)   action_in_projmax_state_dim
state_projaction_time_mlp_inaction_time_mlp_outr   r   r   s     r   r   zPI0ActionTimeEmbedding.__init__?   s    4V< ii(=(=v?P?P?\?\]))F$8$8&:K:K:W:WX"$))A0A0A0M0M,MvO`O`OlOl"m#%99V->->-J-JFL]L]LiLi#j r    c                    | j                  |      }| j                  |      }| j                  |      }|d d d d d f   j                  |      j	                  |j
                        }t        j                  ||gd      }| j                  t        j                  | j                  |                  }t        j                  |d d d d d f   |gd      }|S )Nr$   r   r6   r   )rQ   rO   rL   	expand_astor%   r&   r<   rS   FsilurR   )	r   statenoisetimestepstate_embedsaction_embedsrA   action_time_embedsaction_embeds_mergeds	            r   rB   zPI0ActionTimeEmbedding.forwardG   s    u-++E2**84!!T1*-77FIIP]PcPcId"YY{'CK!55affT=T=TUg=h6ij$yy,q$z*BDV)W]^_##r    )rC   rD   rE   r   rB   rG   rH   s   @r   rJ   rJ   >   s    k
$r    rJ   c                   T     e Zd ZU eed<   dZdZdZdgZdZ	dZ
dZdZdZdZ fdZ xZS )PI0PreTrainedModelr   modelrZ   Tpast_key_values)imagetextc                     t         |   |       t        |t              r:t	        j
                  |j                  |j                  |j                               y y N)	r   _init_weightsr8   r   initcopy_r   r   r   )r   moduler   s     r   ri   z PI0PreTrainedModel._init_weightsb   sC    f%f34JJv++V-A-A&---PQ 5r    )rC   rD   rE   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendinput_modalitiesri   rG   rH   s   @r   rb   rb   T   sR    O&*##4"5N!"&(R Rr    rb   block_boundariesreturnc           
      P     dt         dt         dt         dt         dt        f
 fd}|S )N	batch_idxhead_idxq_idxkv_idxry   c                 f    t        j                  |      }t        j                  |      }||k  S rh   )r&   	bucketize)r{   r|   r}   r~   q_blockkv_blockrx   s         r   
inner_maskz0blockwise_bidirectional_mask.<locals>.inner_maski   s0    //%)9:??6+;<7""r    )intbool)rx   r   s   ` r   blockwise_bidirectional_maskr   h   s3    #c #S # #c #d #
 r    c                   >    e Zd Zdef fdZd Zd ZddZee		 	 	 	 	 	 	 dde
j                  de
j                  dz  d	e
j                  dz  d
e
j                  dz  de
j                  dz  de
j                  dz  de
j                  dz  dedz  defd              Z xZS )PI0Modelr   c                     t         |   |       t        j                  |j                        | _        t        j                  |j                        | _        | j                          y rh   )	r   r   r   from_configr(   dit
vlm_configvlm	post_initrT   s     r   r   zPI0Model.__init__s   sJ     (():):;(():):;r    c                 6    | j                   j                         S rh   )r   get_input_embeddings)r   s    r   r   zPI0Model.get_input_embeddingsy   s    xx,,..r    c                 :    | j                   j                  |       y rh   )r   set_input_embeddings)r   values     r   r   zPI0Model.set_input_embeddings|   s    %%e,r    Nc                    |j                   d   }|j                  dd      }| j                  j                  |      j                  }|j                  d||j                   d   |j                   d         }g }t        |      D ]  \  }}	||   |	   }
|j                  |
         t        j                  |d      }|j                         }d||| j                  j                  j                  k(  <    | j                  j                         |      }|| j                  j                  j                  k(  j                  d      j!                  |      j#                  |j$                        }|j'                  ||      }|S )Nr   r   r   r6   )shapeflattenr   get_image_featurespooler_outputreshape	enumerateappendr&   r<   cloner   r   image_token_idr   	unsqueezerV   rW   r9   masked_scatter)r   	input_idspixel_valuespixel_attention_maskattention_maskmax_num_camerasimage_featurestotal_image_featuresr{   maskunpadded_image_featuresllm_input_idsinputs_embedsspecial_image_masks                 r   embed_prefixzPI0Model.embed_prefix   sa   .44Q7#++Aq144\BPP'//O^EYEYZ[E\^l^r^rst^uv!()=> 	AOIt&4Y&?&E# ''(?@	A  %yy)=1E!)LMi4;;#9#9#H#HHI7557F$++00???Yr]Y}%R$$%	 	 &445GI]^r    r^   r   r   r   r   position_idsr   rd   ry   c	           	         |n|l|||j                  d      dz
  }|| j                  |||      }t        j                  |      dddddf   }
| j	                  ||||
d      j
                  }||j                  dk7  rt        d      dx}}|t        j                  |j                  d   |j                  d   |j                  |j                  	      }t        j                  ||gd
      }t        j                   |d
      dz
  dd|j                  d    df   }|j                         }t        j                  |dz   |j                  d   dz
  g|j                        }t        j                   |d
      dz
  }t        | j                   j"                  |||t%        |            } | j&                  d||||d|	}|S )z
        action_embeds (`torch.Tensor`, *optional*):
            The embeddings of input actions and robot states.
        pixel_attention_mask (`torch.Tensor`, *optional*):
            The mask indicating padded positions in the input image.
        Nr   r   r   T)r   r   r   token_type_ids	use_cacher   z:Only two-dimensional attention masks are accepted for now!r%   r9   r6   )r9   )r   r   r   rd   and_mask_function)r   r   r   rd    )cumsumr   r&   
zeros_liker   rd   ndim
ValueErroronesr   r%   r9   r<   get_seq_lengthtensorr   r   r(   r   r   )r   r^   r   r   r   r   r   r   rd   kwargsr   dit_position_idsdit_attention_mask
noise_maskvlm_input_lengthblock_sizesrx   bidirectional_mask
dit_outputs                      r   rB   zPI0Model.forward   s   ( #(?)l.B-44R81<$ $ 1 1)\K_ `"--m<Q1WEN"hh+-)- '  o  %.*=*=*BYZZ 154-%##A&##A&$**%,,	J "'NJ+GQ!O %-?Q G! KQQ^QdQdefQgPgPiMij +99;ll$4q$8-:M:Ma:PST:T#U^k^r^rs <<;a?6;;))'-+:;KL
 TXX 
'-)+	

 

 r    rh   )NNNNNNN)rC   rD   rE   r   r   r   r   r   r   r   r&   Tensor
LongTensorr   r	   rB   rG   rH   s   @r   r   r   q   s    y /-2  *.,0.24804-1(,E||E <<$&E llT)	E
 t+E $llT1E &&-E ||d*E E 
!E  Er    r   c                       e Zd ZdZddiZdef fdZee	 	 	 	 	 	 	 	 	 	 dde	j                  de	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  dedz  de	j                  defd              Z e	j"                         	 	 	 	 dde	j                  d
e	j                  de	j                  de	j                  dz  de	j                  dz  de	j                  dz  dedz  de	j                  fd       Z xZS )PI0ForConditionalGenerationz9PI0 model with action projection heads and flow matching.action_out_projcolwise_gather_outputr   c                    t         |   |       t        |      | _        |j                  j
                  | _        t        |      | _        t        j                  | j                  |j                        | _        | j                          y rh   )r   r   r   rc   r(   r)   expert_hidden_sizerJ   embed_action_timer   rM   rN   r   r   rT   s     r   r   z$PI0ForConditionalGeneration.__init__   sf     f%
"("3"3"?"?!7!?!yy)@)@&BWBWXr    NrZ   r[   r\   r   r   r   r   r   r   rd   actionsry   c                    |j                   d   }|t        j                  | j                  j                  t        j
                        }t        j                  | j                  j                  t        j
                        }t        j                  j                  ||      }|j                  |f      j                  |j                        }|| j                  j                  z  | j                  j                  z   j                         }|Vt        j                  || j                  j                   | j                  j"                  |j                  |j$                        }|7|ddddf   }||z  d|z
  |z  z   j                  |j$                        }||z
  }n|}| j'                  |||      } | j(                  d	||||||	||
d|}|j*                  dd| j                  j                    df   }| j-                  |      }d}|,t/        j0                  || j                  j2                        }t5        |||j6                  |j8                  |j:                        S )
a-  
        state (`torch.Tensor`, *optional*):
            Current robot state.
        noise (`torch.Tensor`, *optional*):
            Random noise at current timestep that needs to be denoised
        timestep (`torch.Tensor`, *optional*):
            Current denoising timestep.
        pixel_attention_mask (`torch.Tensor`, *optional*):
            The mask indicating padded positions in the input image.
        actions (`torch.Tensor`, *optional*):
            Input actions that need to be predicted. Used only when training to compiute loss.
        r   Nr$   )r9   r%   r   )r   r   r   r   r   r   r^   rd   )	reduction)losslogitsrd   hidden_states
attentionsr   )r   r&   r   r   time_sampling_beta_alphar*   time_sampling_beta_betadistributionsBetasamplerW   r9   time_sampling_scaletime_sampling_offsetfloatrandn
chunk_sizerN   r%   r   rc   last_hidden_stater   rX   mse_lossloss_reductionr
   rd   r   r   )r   rZ   r[   r\   r   r   r   r   r   r   rd   r   r   
batch_sizealpha_tbeta_tdist	time_betatime_expandednoisy_actionstarget_velocityr_   outputslast_hidden_statespredicted_velocityr   s                             r   rB   z#PI0ForConditionalGeneration.forward   s'   : [[^
 ll4;;#G#Gu}}]G\\$++"E"EU]][F&&++GV<DZM255ellCI!DKK$C$CCdkkFfFffmmoH =KK&&**||kkE $Qd]3M*U2a-6G75RRVVW^WdWdeM#goO!M "33E=(S$** 

%)!5%',+

 

 %66q4;;;Q;Q:Q:S7ST!112DE::o/AT[[MgMghD%%#33!//))
 	
r    	num_stepsc           	         |xs | j                   j                  }|j                  d   }	|j                  }
|Ot	        j
                  dd|	| j                   j                  | j                   j                  f|j                  |
      }||j                  d      dz
  }| j                  j                  |||      }| j                  j                  ||dd      j                  }|j                         }d	|z  }t        |      D ]p  }d||z  z   }t	        j                   |t        j"                  |

      j%                  |	      } | ||||||      }|j'                  |       |||j(                  z  z   }r |S )z0Run flow matching inference to generate actions.r   r"   r#   )meanstdsizer%   r9   r   r   T)r   r   r   r   return_dictg      r   )rZ   r[   r\   r   r   rd   )r   num_inference_stepsr   r9   r&   normalr   rN   r%   r   rc   r   r   rd   r   ranger   r*   expandcropr   )r   rZ   r   r   r[   r   r   r   r   r   r9   r   r   rd   prefix_lengthdtstepr?   time_tensoroutputs                       r   sample_actionsz*PI0ForConditionalGeneration.sample_actionsG  s    @!@!@	__Q'
!! =LLKK**KK..
 #((
E %)004q8L

//	<I]^**..')% ) 
 / 	 (668 I)$ 	/D?D,,t5==PWWXbcK$%9- /F   /B..E	/ r    )
NNNNNNNNNN)NNNN)rC   rD   rE   __doc___tp_planr   r   r   r   r&   FloatTensorr   
BoolTensorr   r   r
   rB   no_gradr   r   rG   rH   s   @r   r   r      s   C!#:;Hy   +/-1)-,08<.204-1(,%)T
  T
   4'T
 ##d*	T

 <<$&T
 llT)T
 $..5T
 t+T
 &&-T
 ||d*T
 T
 ""T
 
 T
  T
l U]]_ +/.28< $=  = ##= ''	=
   4'= t+= $..5= := 
		= =r    r   )rb   r   r   )&r-   collections.abcr   r&   torch.nn.functionalr   
functionalrX    r   rj   cache_utilsr   masking_utilsr   modeling_outputsr	   r
   modeling_utilsr   utilsr   r   utils.genericr   autor   configuration_pi0r   Moduler   rJ   rb   r   r   r   r   __all__r   r    r   <module>r     s   *  $     &   6 O - 5 +  (BII .$RYY $, R R R&5<< H  m! m m`c"4 cL Lr    