
    i@                     *   d dl Z d dlZddlmZ ddlmZ ddlmZm	Z	 ddl
mZmZmZmZ ddlmZmZ ddlmZmZmZ dd	lmZ dd
lmZ  e       rddlmZmZ  ej:                  e      Z G d ded      Z e ed       G d de                    Z!dgZ"y)    N   )
AudioInput)BatchFeature)
ImageInputmake_nested_list_of_images)MultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)auto_docstringis_vision_availablelogging)requires)
VideoInput   )Gemma4ImageProcessorKwargs get_aspect_ratio_preserving_sizec                   4    e Zd ZU eed<   dddddii ddidZy)Gemma4ProcessorKwargsimages_kwargsT)paddingreturn_mm_token_type_idsdo_convert_rgbreturn_metadata)text_kwargsr   audio_kwargsvideos_kwargsN)__name__
__module____qualname__r   __annotations__	_defaults     }/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/gemma4/processing_gemma4.pyr   r   $   s5    -- (,

 d
 +T2
Ir&   r   F)total)vision)backendsc                        e Zd Z	 	 	 	 ddededef fdZe	 	 	 	 ddedz  deez  e	e   z  e	e   z  de
dz  d	edz  d
ee   defd       ZdedefdZddZe fd       Z xZS )Gemma4ProcessorNimage_seq_lengthaudio_seq_lengthaudio_ms_per_tokenc	           	         || _         |j                  | _        |j                  | _        |j                  | _        |j                  | _        |j                  ddgi       d| _        |j                  | j                        | _        || _	        || _
        t        |dd      | _        t        |dd      | _        t        |dd      | _        t        |dd      | _        t!        
| D  d	|||||d|	 y)
u  
        image_seq_length (`int`, *optional*, defaults to 280):
            The number of soft tokens per image used for placeholder expansion.
        audio_seq_length (`int`, *optional*, defaults to 750):
            The maximum number of audio soft tokens per audio segment. Serves as an
            upper-bound cap when dynamic audio token counts are computed.
        audio_ms_per_token (`int`, *optional*, defaults to 40):
            Milliseconds of audio per output soft token. Used to dynamically compute
            the number of audio placeholder tokens as ``ceil(duration_ms / audio_ms_per_token)``.
            The default of 40 comes from the SSCP convolution's 4× time reduction on 10ms frames.
        additional_special_tokensz	<|video|>audio_token_idNaudio_token	boa_token	eoa_token)feature_extractorimage_processor	tokenizervideo_processorchat_templater%   )r-   image_token_id	boi_token	eoi_tokenimage_tokenadd_special_tokensvideo_tokenconvert_tokens_to_idsvideo_token_idr.   r/   getattrr2   r3   r4   r5   super__init__)selfr6   r7   r8   r9   r:   r-   r.   r/   kwargs	__class__s             r'   rE   zGemma4Processor.__init__6   s    . !1'66",,",,$00 	$$&AK=%QR&'==d>N>NO !1 #5%i1A4H"9mTB K> K> 	
/++'	
 	
r&   imagestextaudiovideosrG   returnc                 J   ! ||||t        d       | j                  t        fd| j                  j                  i|}t        |t              r|g}n.t        |t              st        |d   t              st        d      i }|F| j                  j                  |      }t        |      } | j                  |fi |d   }|j                  d      }	|s5|D cg c]*  }dj                  | j                  gt        |      z        , }}t        |      t        |      k7  r$t        dt        |       d	t        |       d
      |	D 
cg c]+  }
| j                    | j                  |
z   | j"                   - }}
t%        |       t'        j(                  | j                        }|D cg c]  }t'        j*                  | fd|       }}i }| | j,                  d'd|i|d   }|j                  d      }|j/                  d      s|j                  d      }n|d   }g !t1        ||      D ]  \  }}|j2                  t4        j7                  d       |j2                  dn|j2                  |_        |j8                  D cg c]#  }t;        |dz        ddt;        |dz        d% }}!j=                  dj                  |D cg c].  }| d| j                    | j>                  |z   | j"                   0 c}              t%        !      !t'        j(                  | j>                        }|D cg c]  }t'        j*                  |!fd|       }}i }|W| j@                  | jB                  | jD                  t        d      t        |tF        jH                        r|jJ                  dk(  r|g}|s| j@                  gt        |      z  }|j/                  di       } | jL                  |fi |}| jL                  jN                  }|D cg c]  }| jQ                  ||       }}|D 
cg c]+  }
| jB                   | j@                  |
z   | jD                   - }}
t%        |       t'        j(                  | j@                        }|D cg c]  }t'        j*                  | fd|       }}|d   j                  dd       }|d   j                  dd      } | j                  d'd|i|d   }g }|j=                  d        ||j=                  d!       ||j=                  d"       |r| jS                  |||#       |r| jU                  |d$         |d%<   tW        i |||||&      S c c}w c c}
w c c}w c c}w c c}w c c}w c c}w c c}
w c c}w )(Nz?Provide at least one of `text`, `images`, `audio`, or `videos`.tokenizer_init_kwargsr   zAInvalid input text. Please provide a string, or a list of stringsr   num_soft_tokens_per_image z1Received inconsistently sized batches of images (z) and text (z).c                     t              S Nnext_replacements_iters    r'   <lambda>z*Gemma4Processor.__call__.<locals>.<lambda>   s    d3D.E r&   rL   r   num_soft_tokens_per_videor   video_metadataa  Gemma 4 requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results.   <   02d:c                     t              S rS   rT   )rW   video_replacementss    r'   rY   z*Gemma4Processor.__call__.<locals>.<lambda>   s    d3E.F r&   zUAudio inputs were provided, but the tokenizer does not have an `audio_token` defined.r   r   c                     t              S rS   rT   rV   s    r'   rY   z*Gemma4Processor.__call__.<locals>.<lambda>   s    D9J4K r&   r   return_tensorsr   FrJ   imagevideorK   )
modalities	input_idsmm_token_type_ids)datatensor_typer%   ),
ValueError_merge_kwargsr   r8   init_kwargs
isinstancestrlist	TypeErrorr7   fetch_imagesr   popjoinr>   lenr<   r=   iterreescapesubr9   getzipfpsloggerwarning_once
timestampsintappendr@   r3   r4   r5   npndarrayndimr6   sampling_rate_compute_audio_num_tokens_check_special_mm_tokenscreate_mm_token_type_idsr   )"rF   rI   rJ   rK   rL   rG   output_kwargsimage_inputsbatched_imagesnum_soft_tokensnreplacementspatternpromptvideo_inputsnum_video_tokensr[   metadatan_tokenssecondstimestamp_strtaudio_inputsr   r   anum_audio_tokensaudio_patternrc   r   text_inputsactive_modalitiesrX   ra   s"                                   @@r'   __call__zGemma4Processor.__call__n   s    <FNu}^__***!
"&.."<"<
 
 dC 6DD$'
47C0H_``))66v>F7?N/4//Y-:XYL*../JKO Q_`v$"2"2!3c&k!AB``>"c$i/ GNH[G\\hilmqirhssuv  `ooZ[t~~.t/?/?!/C.DT^^DTUoLo $\ 2 ii 0 01G]abSYBFF7$EvNbDb /4//`v`A_`L+//0KL ::/0!-!1!12B!C!-.>!?!#&).:J&K "(<<''', &.\\%9rx|| X`WjWj!LSs7b=)#.aGbL0A#/FG! ! #))HHgtubcA3a/0@0@80K/LT^^L\]u& "&&8!9ii 0 01G^bcTZBFF7$FOcDc '4>>+AT^^E[ k 
 %,q (()CJ6 ),,^R@L1411%H<HL 22@@MZ_`UV > >q- P``_opZ[t~~.t/?/?!/C.DT^^DTUpLp $\ 2IId&6&67McghY_BFF=*KVThDh&}599:JDQ#0#?#C#CD^`e#f $dnnO$O-2NO $$W-$$W-$$W-))$HY)Z#/3/L/L[YdMe/fK+,PKP<P<P<P&
 	
G a p c2!
 v d4  ap is6   /U80U=-!V(V3V9!VV50V!V r   c                 0   t        |      }t        t        |dz  dz              }t        t        |dz  dz              }|dz   }|dz  }||z   }||z
  |z  dz   }	|	dk  ry|	}
t        d      D ]  }|
dz   }|dz
  dz  dz   }
 t	        |
| j
                        S )aQ  Compute the number of audio soft tokens for a single waveform.

        Replicates the exact sequence-length arithmetic of the audio encoder
        so that the processor inserts the correct number of placeholder tokens.
        The computation mirrors:

        1. Mel framing via ``_unfold`` in ``Gemma4AudioFeatureExtractor``
        2. Two ``Conv2d`` subsampling layers in ``Gemma4AudioSubSampleConvProjection``
           (each: kernel=3, stride=2, semicausal padding top=1, bottom=1)

        The result is capped at ``self.audio_seq_length`` (the configured maximum).

        Args:
            audio_waveform: A 1-D numpy array or list containing the raw audio samples.
            sampling_rate: The sampling rate of the audio waveform in Hz.

        Returns:
            The number of audio soft tokens to insert as placeholders.
        g      4@g     @@g      $@r      r   r   )ru   r   roundrangeminr.   )rF   audio_waveformr   num_samplesframe_length
hop_lengthframe_size_for_unfoldpad_leftpadded_samplesnum_mel_framesr   rW   t_paddeds                r'   r   z)Gemma4Processor._compute_audio_num_tokens   s    ( .) 5!5!>?@}t3f<=>
 ,q 0
  1$$x/(+@@ZORSSQ q 	(A1uHA!#a'A	(
 1d++,,r&   c                 &   t         j                  j                  di       }|j                  |       |j                  dd      xs | j                  j
                  }|j                  dd      xs | j                  j                  }|j                  dd      xs | j                  j                  }||dz  z  }i }	|ig }
|D ]?  }t        |d   |d   |||	      \  }}||z  }||z  }|
j                  ||z  |dz  z         A dgt        |      z  }|	j                  |
|d
       |\t        | j                  dd      }|D cg c]'  }| j                  t        j                  |      |      ) }}|	j                  d|i       t!        di |	S c c}w )av  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.
            audio_lengths (`list[int]`, *optional*):
                The lengths of audio inputs in number of samples. Used to dynamically
                compute per-audio token counts.

        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        r   
patch_sizeNpooling_kernel_sizemax_soft_tokensr   r   r   )heightwidthr   max_patchesr   )num_image_tokensnum_image_patchesr   i>  r   r%   )r   r$   rz   updater7   r   r   r   r   r   ru   rC   r6   r   r   zerosr   )rF   image_sizesaudio_lengthsrG   r   r   r   r   r   vision_datar   
image_sizetarget_htarget_wpatch_heightpatch_widthr   r   lengthr   s                       r'   _get_num_multimodal_tokensz*Gemma4Processor._get_num_multimodal_tokens&  s     .77;;ORPV$"&&|T:]d>R>R>]>]
3T:fd>R>R>f>f 	 (++,=tDlH\H\HlHl%(;Q(>>"!) 
^
%E%a=$Q-) +(;&"(  (:5&*4 ''{(BFY[\F\(\]
^ "#c+&6 64D[lmn$ $D$:$:OVTM^k TZ..rxx/?O     24DEF,,, s   ,Fc                     t         |   }|D cg c]  }|dvr|
 }}| j                  :| j                  j                  }|j                  |D cg c]	  }||vs| c}       |dgz   S c c}w c c}w )N)rP   rZ   rh   )rD   model_input_namesr6   extend)rF   r   namefeature_extractor_input_namesrH   s       r'   r   z!Gemma4Processor.model_input_names]  s    !G5 *
UU 
 
 !!-,0,B,B,T,T)$$7T%vtX\duXud%vw $7#888
 &ws   A+	A0A0)Ni  i  (   )NNNN)NN)r    r!   r"   r   rE   r   r   r   r   rp   r   r   r   r   r   r   r   r   propertyr   __classcell__)rH   s   @r'   r,   r,   3   s      # #"$6
 6
 6
  6
p  %)Z^#'$(F
T!F
 ++d9o=EV@WWF
 D 	F

 T!F
 ./F
 
F
 F
P--s --s --^5-n 9 9r&   r,   )#rw   numpyr   audio_utilsr   image_processing_utilsr   image_utilsr   r   processing_utilsr   r	   r
   r   tokenization_utils_baser   r   utilsr   r   r   utils.import_utilsr   video_utilsr   image_processing_pil_gemma4r   r   
get_loggerr    r}   r   r,   __all__r%   r&   r'   <module>r      s    
  % 2 A X X C A A * % i 
		H	%,E  	;v9n v9   v9r	 
r&   