
    iem                        d Z ddlZddlmZ ddlZddlZddlmc m	Z
 ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0m1Z1 ddl2m3Z3 ddl4m5Z5  e)jl                  e7      Z8e' G d de5             Z9 G d de!d      Z:e' e-d       G d  d!e3                    Z; e'd"#      e G d$ d%e                    Z<d&ejz                  d'efd(Z> G d) d*ej~                        Z@ G d+ d,ej~                        ZAe' G d- d.e             ZBe' G d/ d0eB             ZC G d1 d2eB      ZDg d3ZEy)4zTPI0 model: PaliGemma + Action Expert with flow matching for robot action prediction.    N)Callable)strict)nn   )initialization)Cache)PreTrainedConfig)BatchFeature)
ImageInputmake_nested_list_of_images)create_bidirectional_mask)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)ProcessingKwargsUnpack)PreTokenizedInput	TextInput)auto_docstringcan_return_tuplelogging)maybe_autocast)requires   )CONFIG_MAPPING
AutoConfig	AutoModel)PaligemmaProcessor)SiglipImageProcessorc                   $    e Zd ZdddZdddZdZy)PI0ImageProcessor   )
max_height	max_width)heightwidthTN)__name__
__module____qualname__sizepad_sizedo_pad     t/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/pi0/modular_pi0.pyr!   r!   .   s    C0D,HFr.   r!   c                   "    e Zd ZddddddidZy)	PI0ProcessorKwargs
max_length0   right)paddingr2   padding_sidereturn_tensorspt)text_kwargscommon_kwargsN)r'   r(   r)   	_defaultsr-   r.   r/   r1   r1   5   s#     $#

 +D1Ir.   r1   F)total)visiontorch)backendsc                       e Zd Zd fd	Z	 	 	 ddeee   z  eee      z  dz  deez  ee   z  ee   z  dz  deej                  z  e
j                  z  dz  deej                  z  e
j                  z  dz  dee   defd	Ze fd
       Z xZS )PI0ProcessorNc                 D   |j                   d   |j                   d   c| _        | _        |j                  dg d      }|j                  dg d      }|j                  dg d      }|j                  d	g d
      }t	        j
                  |      | _        t	        j
                  |      | _        t	        j
                  |      | _        t	        j
                  |      | _	        |j                  dd      | _
        |j                  dd      | _        t        	| 5  ||       y )Nr%   r&   
state_mean)ggsgr?g	h"l?gW2D@g\ AcgZd;OſgB>٬?gQI	state_std)gt$~?gL
F%u?g.!u?g/n?g6?gx?g]K=?gF%u?actions_mean)g&S?gX ?gW[재gHPsr?gg?g 	gHPsactions_std)gGz?g`"?g9#J{?gvOjM?g>yX5ͫ?g46<R?gj+?max_state_dim    
chunk_size2   )r*   r%   r&   getr>   tensorrC   rD   rE   rF   rG   rI   super__init__)
selfimage_processor	tokenizerchat_templatekwargsrC   rD   rE   rF   	__class__s
            r/   rN   zPI0Processor.__init__C   s    "1"6"6x"@/BVBVW^B_TZZZ.rs
JJ{,lm	zz.2mnjj0hi,,z2i0!LL6 <<4#ZZ< **\26)4r.   imagestextactionsstaterS   returnc                     | j                   t        fd| j                  j                  i|}|t        j                  d       d}t        |t              r|g}t        |      }t        |      t        |      k7  r$t        dt        |       dt        |       d      |d   j                  d	d      }|d
   j                  d	d       g }	t        ||      D ]V  \  }
}| j                  | j                  z  t        |      z   | j                  j                   |
 d}
|	j!                  |
       X  | j                  |	fi |d   }t#        d |D              }t%        j&                  t        |      |ft$        j(                        }t%        j&                  t        |      |d| j*                  | j,                        }t/        |      D ]=  \  }} | j0                  |fd	di|d
   }t        |      }d||d|f<   |d   ||d|f<   ? i |||d}|t%        j2                  |      | j4                  z
  | j6                  dz   z  }|j8                  d   | j:                  k  r2t=        j>                  |d| j:                  |j8                  d   z
  f      }|jA                  d| jB                  | j:                        |d<   |t%        j2                  |      | jD                  z
  | jF                  dz   z  }|j8                  d   | j:                  k  r2t=        j>                  |d| j:                  |j8                  d   z
  f      }|jA                  d| j:                        |d<   tI        ||      S )aC  
        actions (`list | np.ndarray | torch.Tensor`, *optional*):
            Actions to be predicted by the model. If provided, padding, mean and std normalization will be applied.
        state (`list | np.ndarray | torch.Tensor`, *optional*):
            Robotic states to be predicted by the model. If provided, padding, mean and std normalization will be applied.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. If `suffix`
              is provided, the `input_ids` will also contain the suffix input ids.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **pixel_attention_mask** -- Pixel values padding mask to be fed to a model. Returned when `images` is not `None`.
            - **state** -- Robot state compatible with model if `state` is not None
            - **actions** -- Label-actions compatible with training if `actions` is not None
        tokenizer_init_kwargsNzPYou are using PI0 without a text prefix. The processor will use an empty prompt. z	Received z image samples for z\ prompts. Each prompt should be associated with one sample (with one or more camera images).r9   r7   images_kwargs
c              3   2   K   | ]  }t        |        y wN)len).0sample_imagess     r/   	<genexpr>z(PI0Processor.__call__.<locals>.<genexpr>   s     U]c-0Us   dtyper   r8   Tpixel_values)rg   pixel_attention_maskg:0yE>r   rW   rX   )datatensor_type)%_merge_kwargsr1   rQ   init_kwargsloggerwarning_once
isinstancestrr   ra   
ValueErrorpopzipimage_tokenimage_seq_length	bos_tokenappendmaxr>   zerosboolr%   r&   	enumeraterP   rL   rE   rF   shaperG   FpadviewrI   rC   rD   r
   )rO   rU   rV   rW   rX   rS   output_kwargsbatched_imagesr7   prompt_stringssample
image_listtext_inputsmax_num_camerasrh   padded_pixel_valuesbatchrc   	processednum_camerasreturn_datas                        r/   __call__zPI0Processor.__call__R   s|   6 +**
6:nn6P6P
TZ
 < rsDdC 6D3F;~#d)+C/00CCI; Oe e 
 '}599:JDQo&**+;TB"%dN"; 	*FJ##d&;&;;c*oMNt~~OgOgNhiohpprs  !!&)		* %dnn^T}]7ST UnUU${{C,?+QY^YcYcd#kk#n*=PQSWS^S^`d`j`jk$-n$= 	Q E=,,,]r4rS`apSqrIm,K8< !457@7P|| 34	Q

/$8
 ||G,t/@/@@TEUEUX]E]^G}}R 4#5#55%%!T-?-?'--PRBS-S)TU%,\\"dootGYGY%ZK	"\\%(4??:t~~PU?UVE{{2!3!33ea););ekk"o)M%NO#(::b$2D2D#EK .IIr.   c                      t         |   dgz   S )Nrh   )rM   model_input_names)rO   rT   s    r/   r   zPI0Processor.model_input_names   s    w(,B+CCCr.   )NNN)r'   r(   r)   rN   r   listr   r   npndarrayr>   Tensorr   r1   r
   r   propertyr   __classcell__rT   s   @r/   rA   rA   @   s    5$ bf;?9=WJT*--T*5E0FFMWJ ++d9o=EV@WWZ^^WJ 

"U\\1D8	WJ
 bjj 5<</$6WJ +,WJ 
WJr D Dr.   rA   zlerobot/pi0_base)
checkpointc                       e Zd ZU dZdZeedZdZee	z  dz  e
d<   dZee	z  dz  e
d<   dZee
d<   d	Zee
d
<   d	Zee
d<   dZee
d<   dZee
d<   dZee
d<   dZee
d<   dZee
d<   dZee
d<   dZee
d<   dZee
d<    fdZd Z xZS )	PI0Configa  
    vlm_config (`dict`, *optional*):
        Configuration for the vlm backbone (PaliGemmaModel).
    dit_config (`dict`, *optional*):
        Configuration for the DiT backbone. Defaults to a Gemma 300M variant.
    chunk_size (`int`, *optional*, defaults to 50):
        Number of action steps to predict per chunk.
    max_state_dim (`int`, *optional*, defaults to 32):
        Maximum state vector dimension (shorter vectors are zero-padded).
    max_action_dim (`int`, *optional*, defaults to 32):
        Maximum action vector dimension (shorter vectors are zero-padded).
    num_inference_steps (`int`, *optional*, defaults to 10):
        Number of denoising steps during inference.
    time_sampling_beta_alpha (`float`, *optional*, defaults to 1.5):
        Alpha parameter for Beta distribution used to sample diffusion time during training.
    time_sampling_beta_beta (`float`, *optional*, defaults to 1.0):
        Beta parameter for Beta distribution used to sample diffusion time during training.
    time_sampling_scale (`float`, *optional*, defaults to 0.999):
        Scale factor for sampled time values.
    time_sampling_offset (`float`, *optional*, defaults to 0.001):
        Offset added to sampled time values.
    min_period (`float`, *optional*, defaults to 0.004):
        Minimum period for sinusoidal time embedding.
    max_period (`float`, *optional*, defaults to 4.0):
        Maximum period for sinusoidal time embedding.
    loss_reduction (`str`, *optional*, defaults to `"mean"`):
        The reduction to use on MSE loss.

    Example:
    ```python
    >>> from transformers import PI0ForConditionalGeneration, PI0Config

    >>> config = PI0Config()
    >>> model = PI0ForConditionalGeneration(config)
    ```
    pi0)
vlm_config
dit_configNr   r   rJ   rI   rH   rG   max_action_dim
   num_inference_stepsg      ?time_sampling_beta_alpha      ?time_sampling_beta_betag+?time_sampling_scalegMbP?time_sampling_offsetgMbp?
min_periodg      @
max_periodmeanloss_reductionc                    t        | j                  t              r:| j                  j                  dd      }t	        |   di | j                  | _        n5| j                  )t	        d   ddddddd	d
dddddddd	dd	dd	      | _        t        | j
                  t              r:| j
                  j                  dd      }t	        |   di | j
                  | _        nD| j
                  8t	        d   dddddd| j                  j                  j                        | _        d| j
                  _        d| j
                  _	        d| j                  j                  _	        t        | ,  di | y )N
model_type	paligemmagemmai      i @        i )r   hidden_sizenum_hidden_layersintermediate_sizenum_attention_headsnum_key_value_heads
vocab_sizesiglip_vision_modeli  i     r"         F)	r   r   r   
patch_size
image_sizer   r   r   vision_use_head)text_configvision_configprojection_dimimage_token_idi   i      )r   r   r   r   r   head_dimr   Tr-   )rp   r   dictrK   r   r   r   r   	is_causaluse_bidirectional_attentionrM   __post_init__)rO   rS   vlm_model_typedit_model_typerT   s       r/   r   zPI0Config.__post_init__   sX   doot,!__00{KN,^<OtODO__$,[9")#')+).+,+,"( #8)-#'"$"%)++-"(',
  $%-DO2 doot,!__00wGN,^<OtODO__$,W5 "$"&$%$%??66AADO %*!6:3BF##?''r.   c                     | j                   j                  dz  dk7  r-t        d| j                  j                   j                   d      y)zOPart of `@strict`-powered validation. Validates the architecture of the config.r   r   zDiT hidden dim=(z) must be divisible by 2N)r   r   rr   configrO   s    r/   validate_architecturezPI0Config.validate_architecture  sE    ??&&*a//0F0F0R0R/SSklmm 0r.   )r'   r(   r)   __doc__r   r   sub_configsr   r   r	   __annotations__r   rI   intrG   r   r   r   floatr   r   r   r   r   r   rq   r   r   r   r   s   @r/   r   r      s    #J J!+:FK15J''$.515J''$.5JM3NC!!&)e)%(U(!&&"'%'JJ NC 0(dnr.   r   block_boundariesrY   c           
      P     dt         dt         dt         dt         dt        f
 fd}|S )N	batch_idxhead_idxq_idxkv_idxrY   c                 f    t        j                  |      }t        j                  |      }||k  S r`   )r>   	bucketize)r   r   r   r   q_blockkv_blockr   s         r/   
inner_maskz0blockwise_bidirectional_mask.<locals>.inner_mask"  s0    //%)9:??6+;<7""r.   )r   r{   )r   r   s   ` r/   blockwise_bidirectional_maskr   !  s3    #c #S # #c #d #
 r.   c                   4     e Zd Z fdZed        Zd Z xZS )PI0TimestepEmbeddingsc                 z    t         |           || _        | j                  |      }| j	                  d|d       y )Nsinusoid_freqF)
persistent)rM   rN   r   compute_freqsregister_buffer)rO   r   r   rT   s      r/   rN   zPI0TimestepEmbeddings.__init__+  s:    **62_mNr.   c                    t        j                  dd| j                  j                  dz  t         j                        }| j
                  | j                  | j
                  z  |z  z  }d|z  dz  t        j                  z  }|S )N        r   r   re   )	r>   linspacer   r   float32r   r   mathpi)r   fractionperiodr   s       r/   r   z#PI0TimestepEmbeddings.compute_freqs1  sr    >>#sF,=,=,I,IQ,NV[VcVcd""f&7&7&:K:K&KPX%XXfq(4772r.   c                    t        |j                  j                  t              r/|j                  j                  dk7  r|j                  j                  nd}t	        |d      5  | j
                  d d d f   }||d d d f   z  }t        j                  |j                         |j                         gd      }d d d        |S # 1 sw Y   S xY w)NmpscpuF)device_typeenabledr   dim)
rp   devicetyperq   r   r   r>   catsincos)rO   timer   r   embtime_embedss         r/   forwardzPI0TimestepEmbeddings.forward8  s    *4T[[5E5Es*KPTP[P[P`P`diPidkk&&otUC 	C ..tQw7M$q$w-/C))SWWY	$:BK	C 		C s   #ACC)r'   r(   r)   rN   staticmethodr   r   r   r   s   @r/   r   r   *  s"    O  r.   r   c                   $     e Zd Z fdZd Z xZS )PI0ActionTimeEmbeddingc                 8   t         |           t        |      | _        t	        j
                  |j                  |j                  j                        | _	        t	        j
                  |j                  |j                  j                        | _        t	        j
                  d|j                  j                  z  |j                  j                        | _        t	        j
                  |j                  j                  |j                  j                        | _        y )Nr   )rM   rN   r   sinusoid_embedsr   Linearr   r   r   action_in_projrG   
state_projaction_time_mlp_inaction_time_mlp_outrO   r   rT   s     r/   rN   zPI0ActionTimeEmbedding.__init__B  s    4V< ii(=(=v?P?P?\?\]))F$8$8&:K:K:W:WX"$))A0A0A0M0M,MvO`O`OlOl"m#%99V->->-J-JFL]L]LiLi#j r.   c                    | j                  |      }| j                  |      }| j                  |      }|d d d d d f   j                  |      j	                  |j
                        }t        j                  ||gd      }| j                  t        j                  | j                  |                  }t        j                  |d d d d d f   |gd      }|S )Nre   r   r   r   )r  r  r  	expand_astorf   r>   r   r  r~   silur  )	rO   rX   noisetimestepstate_embedsaction_embedsr   action_time_embedsaction_embeds_mergeds	            r/   r   zPI0ActionTimeEmbedding.forwardJ  s    u-++E2**84!!T1*-77FIIP]PcPcId"YY{'CK!55affT=T=TUg=h6ij$yy,q$z*BDV)W]^_##r.   )r'   r(   r)   rN   r   r   r   s   @r/   r   r   A  s    k
$r.   r   c                   T     e Zd ZU eed<   dZdZdZdgZdZ	dZ
dZdZdZdZ fdZ xZS )PI0PreTrainedModelr   modelrX   Tpast_key_values)imagerV   c                     t         |   |       t        |t              r:t	        j
                  |j                  |j                  |j                               y y r`   )	rM   _init_weightsrp   r   initcopy_r   r   r   )rO   modulerT   s     r/   r  z PI0PreTrainedModel._init_weightse  sC    f%f34JJv++V-A-A&---PQ 5r.   )r'   r(   r)   r   r   base_model_prefixmain_input_namesupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendinput_modalitiesr  r   r   s   @r/   r  r  W  sR    O&*##4"5N!"&(R Rr.   r  c                   >    e Zd Zdef fdZd Zd ZddZee		 	 	 	 	 	 	 dde
j                  de
j                  dz  d	e
j                  dz  d
e
j                  dz  de
j                  dz  de
j                  dz  de
j                  dz  dedz  defd              Z xZS )PI0Modelr   c                     t         |   |       t        j                  |j                        | _        t        j                  |j                        | _        | j                          y r`   )	rM   rN   r   from_configr   ditr   vlm	post_initr  s     r/   rN   zPI0Model.__init__m  sJ     (():):;(():):;r.   c                 6    | j                   j                         S r`   )r+  get_input_embeddingsr   s    r/   r.  zPI0Model.get_input_embeddingss  s    xx,,..r.   c                 :    | j                   j                  |       y r`   )r+  set_input_embeddings)rO   values     r/   r0  zPI0Model.set_input_embeddingsv  s    %%e,r.   Nc                    |j                   d   }|j                  dd      }| j                  j                  |      j                  }|j                  d||j                   d   |j                   d         }g }t        |      D ]  \  }}	||   |	   }
|j                  |
         t        j                  |d      }|j                         }d||| j                  j                  j                  k(  <    | j                  j                         |      }|| j                  j                  j                  k(  j                  d      j!                  |      j#                  |j$                        }|j'                  ||      }|S )Nr   r   ri   r   r   )r}   flattenr+  get_image_featurespooler_outputreshaper|   rx   r>   r   cloner   r   r   r.  	unsqueezer	  r
  r   masked_scatter)rO   	input_idsrg   rh   attention_maskr   image_featurestotal_image_featuresr   maskunpadded_image_featuresllm_input_idsinputs_embedsspecial_image_masks                 r/   embed_prefixzPI0Model.embed_prefixy  sa   .44Q7#++Aq144\BPP'//O^EYEYZ[E\^l^r^rst^uv!()=> 	AOIt&4Y&?&E# ''(?@	A  %yy)=1E!)LMi4;;#9#9#H#HHI7557F$++00???Yr]Y}%R$$%	 	 &445GI]^r.   r  r:  rg   r;  rh   position_idsrA  r  rY   c	           	         |n|l|||j                  d      dz
  }|| j                  |||      }t        j                  |      dddddf   }
| j	                  ||||
d      j
                  }||j                  dk7  rt        d      dx}}|t        j                  |j                  d   |j                  d   |j                  |j                  	      }t        j                  ||gd
      }t        j                   |d
      dz
  dd|j                  d    df   }|j                         }t        j                  |dz   |j                  d   dz
  g|j                        }t        j                   |d
      dz
  }t        | j                   j"                  |||t%        |            } | j&                  d||||d|	}|S )z
        action_embeds (`torch.Tensor`, *optional*):
            The embeddings of input actions and robot states.
        pixel_attention_mask (`torch.Tensor`, *optional*):
            The mask indicating padded positions in the input image.
        Nri   r   r   T)rA  r;  rD  token_type_ids	use_cacher   z:Only two-dimensional attention masks are accepted for now!rf   r   r   )r   )r   rA  r;  r  and_mask_function)rA  r;  rD  r  r-   )cumsumrC  r>   
zeros_liker+  r  ndimrr   onesr}   rf   r   r   get_seq_lengthrL   r   r   r   r   r*  )rO   r  r:  rg   r;  rh   rD  rA  r  rS   rF  dit_position_idsdit_attention_mask
noise_maskvlm_input_lengthblock_sizesr   bidirectional_mask
dit_outputs                      r/   r   zPI0Model.forward  s   ( #(?)l.B-44R81<$ $ 1 1)\K_ `"--m<Q1WEN"hh+-)- '  o  %.*=*=*BYZZ 154-%##A&##A&$**%,,	J "'NJ+GQ!O %-?Q G! KQQ^QdQdefQgPgPiMij +99;ll$4q$8-:M:Ma:PST:T#U^k^r^rs <<;a?6;;))'-+:;KL
 TXX 
'-)+	

 

 r.   r`   )NNNNNNN)r'   r(   r)   r   rN   r.  r0  rC  r   r   r>   r   
LongTensorr   r   r   r   r   s   @r/   r'  r'  k  s    y /-2  *.,0.24804-1(,E||E <<$&E llT)	E
 t+E $llT1E &&-E ||d*E E 
!E  Er.   r'  c                       e Zd ZdZddiZdef fdZee	 	 	 	 	 	 	 	 	 	 dde	j                  de	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  dedz  de	j                  defd              Z e	j"                         	 	 	 	 dde	j                  d
e	j                  de	j                  de	j                  dz  de	j                  dz  de	j                  dz  dedz  de	j                  fd       Z xZS )PI0ForConditionalGenerationz9PI0 model with action projection heads and flow matching.action_out_projcolwise_gather_outputr   c                    t         |   |       t        |      | _        |j                  j
                  | _        t        |      | _        t        j                  | j                  |j                        | _        | j                          y r`   )rM   rN   r'  r  r   r   expert_hidden_sizer   embed_action_timer   r  r   rY  r,  r  s     r/   rN   z$PI0ForConditionalGeneration.__init__  sf     f%
"("3"3"?"?!7!?!yy)@)@&BWBWXr.   NrX   r  r  r:  rg   rh   r;  rD  rA  r  rW   rY   c                    |j                   d   }|t        j                  | j                  j                  t        j
                        }t        j                  | j                  j                  t        j
                        }t        j                  j                  ||      }|j                  |f      j                  |j                        }|| j                  j                  z  | j                  j                  z   j                         }|Vt        j                  || j                  j                   | j                  j"                  |j                  |j$                        }|7|ddddf   }||z  d|z
  |z  z   j                  |j$                        }||z
  }n|}| j'                  |||      } | j(                  d	||||||	||
d|}|j*                  dd| j                  j                    df   }| j-                  |      }d}|,t/        j0                  || j                  j2                        }t5        |||j6                  |j8                  |j:                        S )
a-  
        state (`torch.Tensor`, *optional*):
            Current robot state.
        noise (`torch.Tensor`, *optional*):
            Random noise at current timestep that needs to be denoised
        timestep (`torch.Tensor`, *optional*):
            Current denoising timestep.
        pixel_attention_mask (`torch.Tensor`, *optional*):
            The mask indicating padded positions in the input image.
        actions (`torch.Tensor`, *optional*):
            Input actions that need to be predicted. Used only when training to compiute loss.
        r   Nre   )r   rf   r   )r:  rg   r;  rh   rD  rA  r  r  )	reduction)losslogitsr  hidden_states
attentionsr-   )r}   r>   rL   r   r   r   r   distributionsBetar   r
  r   r   r   r   randnrI   r   rf   r]  r  last_hidden_staterY  r~   mse_lossr   r   r  rb  rc  )rO   rX   r  r  r:  rg   rh   r;  rD  rA  r  rW   rS   
batch_sizealpha_tbeta_tdist	time_betatime_expandednoisy_actionstarget_velocityr  outputslast_hidden_statespredicted_velocityr`  s                             r/   r   z#PI0ForConditionalGeneration.forward  s'   : [[^
 ll4;;#G#Gu}}]G\\$++"E"EU]][F&&++GV<DZM255ellCI!DKK$C$CCdkkFfFffmmoH =KK&&**||kkE $Qd]3M*U2a-6G75RRVVW^WdWdeM#goO!M "33E=(S$** 

%)!5%',+

 

 %66q4;;;Q;Q:Q:S7ST!112DE::o/AT[[MgMghD%%#33!//))
 	
r.   	num_stepsc           	         |xs | j                   j                  }|j                  d   }	|j                  }
|Ot	        j
                  dd|	| j                   j                  | j                   j                  f|j                  |
      }||j                  d      dz
  }| j                  j                  |||      }| j                  j                  ||dd      j                  }|j                         }d	|z  }t        |      D ]p  }d||z  z   }t	        j                   |t        j"                  |

      j%                  |	      } | ||||||      }|j'                  |       |||j(                  z  z   }r |S )z0Run flow matching inference to generate actions.r   r   r   )r   stdr*   rf   r   ri   r   T)rA  r;  rD  rG  return_dictg      rH  )rX   r  r  rh   r;  r  )r   r   r}   r   r>   normalrI   r   rf   rJ  r  rC  r+  r  rN  rangerL   r   expandcropra  )rO   rX   r:  rg   r  r;  rh   rt  rS   ri  r   rD  rA  r  prefix_lengthdtstepr   time_tensoroutputs                       r/   sample_actionsz*PI0ForConditionalGeneration.sample_actionsA  s    @!@!@	__Q'
!! =LLKK**KK..
 #((
E %)004q8L

//	<I]^**..')% ) 
 / 	 (668 I)$ 	/D?D,,t5==PWWXbcK$%9- /F   /B..E	/ r.   )
NNNNNNNNNN)NNNN)r'   r(   r)   r   _tp_planr   rN   r   r   r>   FloatTensorr   
BoolTensorrV  r   r   r   no_gradr   r  r   r   s   @r/   rX  rX    s   C!#:;Hy   +/-1)-,08<.204-1(,%)T
  T
   4'T
 ##d*	T

 <<$&T
 llT)T
 $..5T
 t+T
 &&-T
 ||d*T
 T
 ""T
 
 T
  T
l U]]_ +/.28< $=  = ##= ''	=
   4'= t+= $..5= := 
		= =r.   rX  )r   r  r'  rX  rA   r!   )Fr   r   collections.abcr   numpyr   r>   torch.nn.functionalr   
functionalr~   huggingface_hub.dataclassesr   r\   r   r  cache_utilsr   configuration_utilsr	   feature_extraction_utilsr
   image_utilsr   r   masking_utilsr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   r   tokenization_utils_baser   r   utilsr   r   r   utils.genericr   utils.import_utilsr   autor   r   r   paligemma.processing_paligemmar   siglip.image_processing_siglipr   
get_loggerr'   rn   r!   r1   rA   r   r   r   Moduler   r   r  r'  rX  __all__r-   r.   r/   <module>r     s   [  $     .  &   3 4 A 6 O - 8 C > > + * 8 8 ? A 
		H	% ,  )  	&'kD% kD ( kD\ -.ln  ln  /ln^5<< H BII .$RYY $, R R R& m! m m`c"4 cLr.   