
    i$                        d Z ddlZddlmZ ddlmZmZ ddlm	Z	m
Z
mZmZ ddlmZmZ ddlmZmZmZ dd	lmZ  e       rd
dlmZ  ej0                  e      Z G d de
d      ZdefdZd Ze ed       G d de                    ZdgZ y)z
Processor class for Pixtral.
    N   )BatchFeature)
ImageInputis_valid_image)MultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)auto_docstringis_vision_availablelogging)requires   )get_resize_output_image_sizec                        e Zd ZdddddidZy)PixtralProcessorKwargsF)paddingreturn_mm_token_type_idsreturn_tensorspt)text_kwargscommon_kwargsN)__name__
__module____qualname__	_defaults     /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/pixtral/processing_pixtral.pyr   r   (   s"     (-

 d
Ir    r   F)totalreturnc                 H    t        | t              xr | j                  d      S )Nhttp)
isinstancestr
startswith)vals    r!   is_urlr*   5   s    c3:CNN6$::r    c                 2    t        |       xs t        |       S N)r*   r   )elems    r!   is_image_or_image_urlr.   :   s    $</>$//r    )torchvisiontorch)backendsc            
            e Zd Z	 	 	 	 	 	 	 	 ddedef fdZe	 	 ddedz  deez  e	e   z  e	e   z  de
e   defd	       Zdd
Zed        Z xZS )PixtralProcessorN
patch_sizespatial_merge_sizec	                    t         
|   |||       || _        || _        || _        |j                  | j                        | _        || _        || _        |j                  | j                        | _        |j                  | j                        | _	        |j                  | j                        | _
        | j                  | j                  | j                  g| _        y)a  
        patch_size (`int`, *optional*, defaults to 16):
            Patch size from the vision tower.
        spatial_merge_size (`int`, *optional*, defaults to 1):
            The downsampling factor for the spatial merge operation.
        image_token (`str`, *optional*, defaults to `"[IMG]"`):
            Special token used to denote image location.
        image_break_token (`str`, *optional*, defaults to `"[IMG_BREAK]"`):
            Special token used to denote the end of a line of pixels in an image.
        image_end_token (`str`, *optional*, defaults to `"[IMG_END]"`):
            Special token used to denote the end of an image input.
        )chat_templateN)super__init__r4   r5   image_tokenconvert_tokens_to_idsimage_token_idimage_break_tokenimage_end_tokenimage_break_token_idimage_end_token_id	image_ids)selfimage_processor	tokenizerr4   r5   r7   r:   r=   r>   kwargs	__class__s             r!   r9   zPixtralProcessor.__init__A   s    0 	)=Q$"4&'==d>N>NO!2.'==d>N>NO$-$C$CDDZDZ$[!"+"A"A$BVBV"W--t/H/H$JaJabr    imagestextrE   r#   c           	          | j                   t        fdt        | j                  di       i|}| j                  | j
                  z  }|||d   d<    | j                  |fi |d   }ni }t        |t              r|g}n.t        |t              st        |d   t              st        d      |}|j                  d      t        |d	         }g }g }	|D ]  }
| j                  |
v rt        |      \  }}||z  }||z  }| j                  g|z  | j                  gz   g|z  }|D cg c]  }|D ]  }|  }}}| j                   |d
<   dj#                  |      }|	j%                  |       |
j'                  | j                  dd      }
| j                  |
v rd|
v r)|	j)                  d      }|
j'                  d|d      }
d|
v r)|j%                  |
        |d   j)                  dd      }|d   j)                  dd      }|d   j)                  dd        | j                  |fi |d   ddi}| j+                  ||dg       |r| j-                  |d         |d<   t/        i |||      S c c}}w )a?  
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
            `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        tokenizer_init_kwargsinit_kwargsNimages_kwargsr4   r   zAInvalid input text. Please provide a string, or a list of stringspixel_valuesimage_sizes z<placeholder>r   r   r   r   Freturn_token_type_idsimage)
modalities	input_idsmm_token_type_ids)datatensor_type)_merge_kwargsr   getattrrD   r4   r5   rC   r&   r'   list	TypeErrorgetiterr:   nextr=   r>   joinappendreplacepop_check_special_mm_tokenscreate_mm_token_type_idsr   )rB   rG   rH   rE   output_kwargsr4   image_inputsprompt_stringsrN   replace_stringssampleheightwidthnum_height_tokensnum_width_tokensreplace_tokenssublistitemreplace_strr   r   text_inputss                         r!   __call__zPixtralProcessor.__call__f   s   $ +**"
")$..-"L
 
 __t'>'>>
;EM/*<8/4//Y-:XYLLdC 6DD$'
47C0H_`` N+7|M:;KN O .&&&0$($5MFE(.*(<%',
':$))*-==AWAW@XX&)&*N ;I%]wU\%]Td%]d%]N%])-)=)=N2&"$''."9K#**;7#^^D,<,<oqQF &&&0 &/"1"5"5a"8K#^^O[!LF &/ %%f-%.( '}599:JDQ#0#?#C#CD^`e#f m$(()@$G$dnn^i}]7Sidhi%%nkwi%X#/3/L/L[YdMe/fK+,!@K!@<!@n]]) &^s   %I-c                 
   i }|t         j                  j                  di       }|j                  |       |j                  dd      xs | j                  j
                  }| j                  | j                  z  }g }|D ]W  \  }}	t        t        j                  ||	df      |d   |d   f||f      \  }
}|
|z  }||z  }|j                  |dz   |z         Y dgt        |      z  }|j                  ||d       t        d	i |S )
a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.

        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        NrL   sizer   longest_edge)ru   r4   r   )num_image_tokensnum_image_patchesr   )r   r   r\   updaterC   ru   r4   r5   r   npzerosr`   lenr   )rB   rN   rE   vision_datarL   ru   r4   rw   rj   rk   resized_heightresized_widthrl   rm   rx   s                  r!   _get_num_multimodal_tokensz+PixtralProcessor._get_num_multimodal_tokens   s-    "2<<@@RTUM  ( $$VT2Od6J6J6O6OD4+B+BBJ!!, T0LHHfeQ/0~.^0DE *J71-
 %3j$@!#0J#>  '')9A)=AR(RST "#c+&6 64D[lmn,,,r    c                 l    | j                   j                  }| j                  j                  }||z   dgz   S )NrN   )rD   model_input_namesrC   )rB   tokenizer_input_namesimage_processor_input_namess      r!   r   z"PixtralProcessor.model_input_names   s7     $ @ @&*&:&:&L&L#$'BBm_TTr    )NN   r   Nz[IMG]z[IMG_BREAK]z	[IMG_END])NNr,   )r   r   r   intr9   r   r   r   r   rZ   r
   r   r   rs   r   propertyr   __classcell__)rF   s   @r!   r3   r3   >   s    
 "#'##c 	#c
  #cJ  %)Z^I^T!I^ ++d9o=EV@WWI^ /0	I^
 
I^ I^V"-H U Ur    r3   )!__doc__numpyrz   feature_extraction_utilsr   image_utilsr   r   processing_utilsr   r   r	   r
   tokenization_utils_baser   r   utilsr   r   r   utils.import_utilsr   image_processing_pixtralr   
get_loggerr   loggerr   boolr*   r.   r3   __all__r   r    r!   <module>r      s     4 5  D A A * F 
		H	%	-U 	;4 ;
0 	+,ZU~ ZU - ZUz 
r    