
    i8'                        d dl Zd dlZd dlmc mZ ddlmZ ddl	m
Z
mZ ddlmZmZmZmZ ddlmZmZmZ ddlmZmZ ddlmZ  ej4                  e      Z G d	 d
ed      ZdZ ed      D  cg c]	  } d| dd c}  ed      D  cg c]	  } d| dd c} z   Z e ed       G d de                    Z!dgZ"yc c} w c c} w )    N   )BatchFeature)
ImageInputmake_nested_list_of_images)MultiModalDataProcessingKwargsProcessorMixinUnpack)
AddedTokenPreTokenizedInput	TextInput)auto_docstringlogging)requiresc                   "    e Zd ZddddddidZy)	PI0ProcessorKwargs
max_length0   right)paddingr   padding_sidereturn_tensorspt)text_kwargscommon_kwargsN)__name__
__module____qualname__	_defaults     w/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/pi0/processing_pi0.pyr   r   $   s#     $#

 +D1Ir!   r   F)totalz<image>i   z<locz0>4>   z<segz0>3)visiontorch)backendsc                   0    e Zd Zd fd	Ze	 	 	 ddeee   z  eee      z  dz  deez  ee   z  ee   z  dz  dee	j                  z  ej                  z  dz  dee	j                  z  ej                  z  dz  dee   defd	       Zdd
Ze fd       Z xZS )PI0ProcessorNc                    |j                   d   |j                   d   c| _        | _        |j                  dg d      }|j                  dg d      }|j                  dg d      }|j                  d	g d
      }t	        j
                  |      | _        t	        j
                  |      | _        t	        j
                  |      | _        t	        j
                  |      | _	        |j                  dd      | _
        |j                  dd      | _        t        |d      st        d      |j                  | _        t        |d      sNt        t         dd      }	d|	gi}
|j#                  |
       |j%                  t               | _        t         | _        n"|j&                  | _        |j(                  | _        |j+                  t,               d|_        d|_        t2        | i  |||       y )Nheightwidth
state_mean)ggsgr?g	h"l?gW2D@g\ AcgZd;OſgB>٬?gQI	state_std)gt$~?gL
F%u?g.!u?g/n?g6?gx?g]K=?gF%u?actions_mean)g&S?gX ?gW[재gHPsr?gg?g 	gHPsactions_std)gGz?g`"?g9#J{?gvOjM?g>yX5ͫ?g46<R?gj+?max_state_dim    
chunk_size2   image_seq_lengthz;Image processor is missing an `image_seq_length` attribute.image_tokenFT)
normalizedspecialadditional_special_tokens)chat_template)sizer,   r-   getr'   tensorr.   r/   r0   r1   r2   r4   hasattr
ValueErrorr6   r   IMAGE_TOKENadd_special_tokensconvert_tokens_to_idsimage_token_idr7   
add_tokensEXTRA_TOKENSadd_bos_tokenadd_eos_tokensuper__init__)selfimage_processor	tokenizerr;   kwargsr.   r/   r0   r1   r7   tokens_to_add	__class__s              r"   rJ   zPI0Processor.__init__6   s   "1"6"6x"@/BVBVW^B_TZZZ.rs
JJ{,lm	zz.2mnjj0hi,,z2i0!LL6 <<4#ZZ< **\26(:;Z[[ / @ @y-0$[UDQK8;-HM((7"+"A"A+"ND*D"+":":D(44D\*"'	"'	)=Qr!   imagestextactionsstaterN   returnc                     | j                   t        fd| j                  j                  i|}|t        j                  d       d}t        |t              r|g}t        |      }t        |      t        |      k7  r$t        dt        |       dt        |       d      |d   j                  d	d      }|d
   j                  d	d       g }	t        ||      D ]V  \  }
}| j                  | j                  z  t        |      z   | j                  j                   |
 d}
|	j!                  |
       X  | j                  |	fi |d   }t#        d |D              }t%        j&                  t        |      |ft$        j(                        }t%        j&                  t        |      |d| j*                  | j,                        }t/        |      D ]=  \  }} | j0                  |fd	di|d
   }t        |      }d||d|f<   |d   ||d|f<   ? i |||d}|t%        j2                  |      | j4                  z
  | j6                  dz   z  }|j8                  d   | j:                  k  r2t=        j>                  |d| j:                  |j8                  d   z
  f      }|jA                  d| jB                  | j:                        |d<   |t%        j2                  |      | jD                  z
  | jF                  dz   z  }|j8                  d   | j:                  k  r2t=        j>                  |d| j:                  |j8                  d   z
  f      }|jA                  d| j:                        |d<   tI        ||      S )aC  
        actions (`list | np.ndarray | torch.Tensor`, *optional*):
            Actions to be predicted by the model. If provided, padding, mean and std normalization will be applied.
        state (`list | np.ndarray | torch.Tensor`, *optional*):
            Robotic states to be predicted by the model. If provided, padding, mean and std normalization will be applied.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. If `suffix`
              is provided, the `input_ids` will also contain the suffix input ids.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **pixel_attention_mask** -- Pixel values padding mask to be fed to a model. Returned when `images` is not `None`.
            - **state** -- Robot state compatible with model if `state` is not None
            - **actions** -- Label-actions compatible with training if `actions` is not None
        tokenizer_init_kwargsNzPYou are using PI0 without a text prefix. The processor will use an empty prompt. z	Received z image samples for z\ prompts. Each prompt should be associated with one sample (with one or more camera images).r   r   images_kwargs
c              3   2   K   | ]  }t        |        y wN)len).0sample_imagess     r"   	<genexpr>z(PI0Processor.__call__.<locals>.<genexpr>   s     U]c-0Us   )dtyper   r   Tpixel_values)rb   pixel_attention_maskg:0yE>r   rS   rT   )datatensor_type)%_merge_kwargsr   rM   init_kwargsloggerwarning_once
isinstancestrr   r]   r@   popzipr7   r6   	bos_tokenappendmaxr'   zerosboolr,   r-   	enumeraterL   r>   r0   r1   shaper2   Fpadviewr4   r.   r/   r   )rK   rQ   rR   rS   rT   rN   output_kwargsbatched_imagesr   prompt_stringssample
image_listtext_inputsmax_num_camerasrc   padded_pixel_valuesbatchr_   	processednum_camerasreturn_datas                        r"   __call__zPI0Processor.__call__X   s|   8 +**
6:nn6P6P
TZ
 < rsDdC 6D3F;~#d)+C/00CCI; Oe e 
 '}599:JDQo&**+;TB"%dN"; 	*FJ##d&;&;;c*oMNt~~OgOgNhiohpprs  !!&)		* %dnn^T}]7ST UnUU${{C,?+QY^YcYcd#kk#n*=PQSWS^S^`d`j`jk$-n$= 	Q E=,,,]r4rS`apSqrIm,K8< !457@7P|| 34	Q

/$8
 ||G,t/@/@@TEUEUX]E]^G}}R 4#5#55%%!T-?-?'--PRBS-S)TU%,\\"dootGYGY%ZK	"\\%(4??:t~~PU?UVE{{2!3!33ea););ekk"o)M%NO#(::b$2D2D#EK .IIr!   c                     i }|<| j                   gt        |      z  }dgt        |      z  }|j                  ||d       t        di |S )a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (list[list[str]], *optional*):
                The input sizes formatted as (height, width) per each image.
        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
           )num_image_tokensnum_image_patchesr    )r6   r]   updater   )rK   image_sizesrN   vision_datar   r   s         r"   _get_num_multimodal_tokensz'PI0Processor._get_num_multimodal_tokens   s]     " $ 5 56[9II!"c+&6 64D[lmn,,,r!   c                      t         |   dgz   S )Nrc   )rI   model_input_names)rK   rP   s    r"   r   zPI0Processor.model_input_names   s    w(,B+CCCr!   )NNNr\   )r   r   r   rJ   r   r   listr   r   npndarrayr'   Tensorr
   r   r   r   r   propertyr   __classcell__)rP   s   @r"   r*   r*   3   s    RD  bf;?9=WJT*--T*5E0FFMWJ ++d9o=EV@WWZ^^WJ 

"U\\1D8	WJ
 bjj 5<</$6WJ +,WJ 
WJ WJr-$ D Dr!   r*   )#numpyr   r'   torch.nn.functionalnn
functionalrv   feature_extraction_utilsr   image_utilsr   r   processing_utilsr   r   r	   r
   tokenization_utils_baser   r   r   utilsr   r   utils.import_utilsr   
get_loggerr   ri   r   rA   rangerF   r*   __all__)is   0r"   <module>r      s   *     4 A X X O O , * 
		H	%)  ).t5A$qgQ5RWX[R\8]Q4#wa8]] 	&'QD> QD ( QDh 
s 68]s   /B<	C