
    #iwj                    z   d Z ddlmZ ddlZddlZddlmZ ddlmZm	Z	 ddl
mZ ddlZddlZddlmZmZmZmZ 	 ddlmZ 	 dd	lmZmZ  ej8                  e      Zd
d
d
d
dZddZ ddZ!ddZ"ddZ#ddZ$ddZ% G d d      Z&	 d	 	 	 	 	 ddZ'	 d	 	 	 	 	 ddZ(ddZ)y# e$ r dZY hw xY w# eef$ r dZdZY pw xY w)zAModality detection, input parsing, and message format conversion.    )annotationsN)defaultdict)AnyLiteral)urlparse)MessageFormatModality	PairInputSingleInput)Image)AudioDecoderVideoDecoderflat)apertusdeepseek_v3gpt_ossseed_ossc                0    | j                  d      xr d| vS )zQCheck if a string looks like a valid URL (starts with http(s) and has no spaces).)zhttp://zhttps:// )
startswithtexts    t/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/sentence_transformers/base/modality.py_looks_like_urlr   ,   s    ??23G4G    c                    t        |       r4t        |       j                  j                         }|j	                  |      S | j                         j	                  |      xr t
        j                  j                  |       S )zOCheck if a string is a URL or local file path with one of the given extensions.)r   r   pathlowerendswithosisfile)r   
extensionsr   s      r   _is_media_url_or_pathr#   1   sZ    t~""((*}}Z((::<  ,E1EEr   c                >    | j                  d      ryt        | d      S )z:Check if a string is an image URL, file path, or data URI.zdata:image/T)z.jpgz.jpegz.pngz.gifz.bmpz.tiffz.webp)r   r#   r   s    r   is_image_url_or_pathr%   9   s    }% 'bccr   c                d    t        | d      ryt        |       xr t        |       j                  dv S )z.Check if a string is a video URL or file path.)z.mp4z.aviz.movz.wmvz.flvz.mkvT)zwww.youtube.comzyoutube.comzyoutu.bezm.youtube.com)r#   r   r   netlocr   s    r   is_video_url_or_pathr(   @   s8    T#ST4  Xd^%:%: ? & r   c                    t        | d      S )z/Check if a string is an audio URL or file path.)z.mp3z.wavz.oggz.flacz.aac)r#   r   s    r   is_audio_url_or_pathr*   L   s     'PQQr   c                0   t        | t        t        f      rt        |       dk7  ryt        | d   t              rt        | d   t              ry| D ]E  }t        |t
              r
d|v rd|v r yt        |t              s.|s1t        |d   t
              sE y y)a  Check if a sample is a non-text pair (2-element tuple/list with at least one non-string element).

    Text pairs ``(str, str)`` are handled natively by tokenizers and detected as ``"text"`` modality
    by :func:`infer_modality`. This helper detects pairs that contain at least one non-string element
    (e.g. an image, audio array, or dict), which require conversion to message format.
       Fr      rolecontentT)
isinstancetuplelistlenstrdict)sampleelems     r   _is_non_text_pairr8   Q   s     fudm,Fq0@&)S!jC&@ dD!fnd9JdD!dz$q'4/H	
 r   c                      e Zd ZdZ	 	 	 d	 	 	 	 	 	 	 ddZddZ	 	 	 	 ddZddZdddZ	 	 	 	 	 	 ddZ	e
dd	       Zdd
Z	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZy)InputFormattera  Handles input parsing, modality detection, and message format conversion.

    This class manages the complete input preprocessing pipeline:
    1. Parsing raw inputs to detect their modality (text, image, audio, video, message)
    2. Converting inputs to different chat template formats
    3. Normalizing mixed-modality inputs

    Different models require different message/chat template formats:
    - **Structured format**: Content is a list of dicts with type annotations
        [{"role": "user", "content": [{"type": "text", "text": "hello"}]}]

    - **Flat format**: Content is the direct value
        [{"role": "user", "content": "hello"}]

    Args:
        model_type: The model type string (e.g. from ``config.model_type``).
        message_format: Message format to use. Options:
            - ``"structured"``: Content is a list of dicts with type/modality keys
            - ``"flat"``: Content is the direct value
            - ``"auto"``: Automatically infer from processor (default)
        processor: Optional processor to infer format from when ``message_format="auto"``.
        supported_modalities: Optional list of modalities supported by the model. When provided,
            string inputs that look like media URLs/paths are only classified as non-text if the
            model actually supports that modality. This prevents text-only models from
            misclassifying text containing media URLs.
    Nc                    || _         || _        || _        |dk(  r!|r| j                  |      | _        y d| _        y || _        y )Nauto
structured)
model_type	processorsupported_modalities_infer_formatmessage_format)selfr>   rB   r?   r@   s        r   __init__zInputFormatter.__init__   sG     %"$8!V#CL$"4"4Y"?DR^D"0Dr   c                    | j                   t        v rt        | j                      S t        |dd      t        t              rsyg d}t        fd|D              ryy)a  Infer the message format expected by the processor.

        Checks known model types first, then inspects the processor's chat template
        for patterns indicating structured format. Defaults to ``"structured"`` if
        neither approach is conclusive.

        Args:
            processor: The processor/tokenizer to inspect.

        Returns:
            ``"structured"`` or ``"flat"`` message format.
        chat_templateNr=   )z
content[0]z.typez'type'z"type"z	item.typezmessage.content[c              3  &   K   | ]  }|v  
 y wN ).0patterntemplates     r   	<genexpr>z/InputFormatter._infer_format.<locals>.<genexpr>   s     Fww("Fs   r   )r>   !KNOWN_MODEL_TYPES_MESSAGE_FORMATSgetattrr0   r4   any)rC   r?   structured_patternsrL   s      @r   rA   zInputFormatter._infer_format   s]     ????4T__EE9ot<(C(
 F2EFFr   c                   |sddg it        t              fS g }t        t              }d}|D ]  }t        |      r|j                  d|f       d}%t	        || j
                        }|dk(  r"t        |t              r|d   }|d   |d   d<   n|dk(  rdt        ^t        |t              rN|j                         }|j                  j                  d	
      j                         }|j                  |d   d<   n|dk(  r=t        |t              r-|d   }|d   j                  dg       j                  |d          n|dk(  rt        t        |t              rt        |      }	|j!                  d	|	      }
|
j                  }|d   j                  dg       j                  |j"                  j$                  |j"                  j&                  t)        t+        |
j                  j,                  d	               d       n|dk(  rt        |t              r|g}n|}|j                  ||f        |rg }|D ]  \  }}|dk(  r!|j                  | j/                  |             ,|dk(  rEt        |t0        t(        f      r/t        |      dk(  r!|j                  | j/                  |             v|dk(  r|j                  |       t        |t0              r|n||i}|j                  | j3                  |              dd|i|fS t5        | \  }}t)        |      }t7        |      }t        |      dk(  r\|j9                         }t        |t:              r||i}n|d	   j=                         }|D ci c]  }||D cg c]  }||   	 c} }}}nUt>        jA                  d| d       d|D cg c]*  \  }}| j3                  t        |t0              r|n||i      , c}}i}d}|||fS c c}w c c}}w c c}}w )a-  Parse inputs and group by modality.

        Analyzes a list of inputs to detect their modality (text, image, audio, video, message)
        and groups them appropriately for the processor. Handles mixed modalities by converting
        to message format when necessary.

        Non-text pairs (e.g. ``(image, text)`` or ``(image, image)``) are detected and converted
        to message format with ``"query"``/``"document"`` roles via :meth:`pair_to_messages`.

        Args:
            inputs: List of inputs to parse. Can be:
                - str: Text inputs
                - tuple/list of str: Text pairs (for cross-encoders)
                - tuple/list of mixed types: Non-text pairs (e.g. image + text)
                - dict: Chat messages, audio data, or multimodal inputs
                - PIL.Image.Image: Image inputs
                - np.ndarray/torch.Tensor: Audio (1-2D) or video (3-5D) inputs

        Returns:
            Tuple of (modality, processor_inputs_dict, extra_modality_kwargs) where:
                - modality: Detected modality string (``"text"``, ``"image"``, etc.) or tuple of modalities
                - processor_inputs_dict: Dictionary mapping modality names to input lists
                - extra_modality_kwargs: Extra kwargs per modality (e.g. ``sampling_rate`` for audio)
        r   FpairTr@   audioarraysampling_rater   )dimvideovideo_metadata)fpstotal_num_framesframes_indicesmessager,   r-   zMixed modalities detected: z!. Converting to 'message' format.)!r   r5   r8   appendinfer_modalityr@   r0   r   get_all_samplesdatameannumpysample_rate
setdefaultr   r3   get_frames_in_rangemetadataaverage_fps
num_framesr2   rangeshapepair_to_messagesr1   
to_messagezipsetpopr4   keysloggerdebug)rC   inputstyped_inputsextra_modality_kwargs	has_pairsitemmodalityvaluesamplesrj   frame_batchmessagesmodtyped
modalitiesprocessed_inputsunique_modalitiesordered_keysentrys                      r   parse_inputszInputFormatter.parse_inputs   s   8 FB<T):::EG +D 1	 (	3D !&##VTN3 	%dAZAZ[H 7"z$'=WBFBW%g.?W$)AjQUWcFd..0))a)0668BIBUBU%g.?W$D$)?W%g.99:JBOVVW[\lWmnW$)AjQUWcFd Y
"66q*E#((%g.99:JBOVV#}}88,0MM,D,D*.u[5E5E5K5KA5N/O*P Y&:dD+A5 12Q(	3Z H* 	<
U&=OOD$9$9%$@AF]z%%'GCPUJZ[OOOD$9$9%$@AI%OOE*%/U%;E#uEOODOOE$:;	< y(35JJJ'*L'9$
$ 01
O !Q&(,,.H(C($,.>#?   02779_k#lX[CBR)S%*)S$S#l #lLL67H6IIjkl+7'% OOZ%-HExY^N_`  !H)+@@@ *T#ls   '
O%1O =O%%/O+ O%c                    |\  }}t        |      }t        |      }| j                  dk(  r
d|dd|dgS d }d |||      dd |||      dgS )a  Convert a pair of inputs to query/document message format.

        Each element of the pair is wrapped in a message with role ``"query"`` (first element)
        or ``"document"`` (second element). The modality of each element is inferred individually
        via :func:`infer_modality`.

        Args:
            pair: A 2-element tuple or list of inputs (e.g. ``(image, text)``).

        Returns:
            List of two message dictionaries with ``"query"`` and ``"document"`` roles.
        r   queryr.   r/   documentc                    t        | t              r,t        |t              r| D cg c]  }||v sd||||   i c}S d| | |igS c c}w )Ntype)r0   r1   r5   )rz   ry   r   s      r   _to_contentz4InputFormatter.pair_to_messages.<locals>._to_content@  sT     (E*z$/EAIY#STX[c495YYXx677 Zs
   	AA)r`   rB   )rC   rS   
query_itemdoc_itemquery_modalitydoc_modalityr   s          r   rm   zInputFormatter.pair_to_messages)  sy      $
H'
3%h/&( Z8#9 
	8 ^Z)PQKh,OP
 	
r   c           	        | j                   dk(  rNt        |      dk(  r+t        t        |j	                                     \  }}||dgS t
        j                  d       ||j	                         D cg c]  \  }}d|||i c}}dgS c c}}w )a  Convert a typed input dictionary to message format.

        Produces a single message with the given ``role``. For pair/multi-value inputs,
        use :meth:`pair_to_messages` instead (which is called automatically by :meth:`parse_inputs`).

        Args:
            typed_input: Dictionary mapping modality to input value (single value per modality).
            role: Role for the message (default: ``"user"``).

        Returns:
            List of message dictionaries (single message).
        r   r-   r   zbFlat message format requested but multiple modalities detected. Falling back to structured format.r   )rB   r3   nextiteritemsrs   warning)rC   typed_inputr.   _r{   rz   s         r   rn   zInputFormatter.to_messageL  s     &(;1$[%6%6%8 9:5!%%8999 WbWhWhWjkOHeVXx?k
 	
 ls   2B	c                P   |sddg ifS t        |t              r|fn|}t        t        t	        |j                                           }g }t        |      D ]  }|D ci c]  }||v s|||   |    }}t        |      dk(  ryt        t	        |j                                     }	t        |	t        t        f      rAt        |	      dk(  r3t        d |	D              r!|j                  | j                  |	             |j                  | j                  |              dd|ifS c c}w )ac  Convert a batch of modality-specific inputs into the unified message format.

        Args:
            modality: The modality key (string) or tuple of modality keys.
            processor_inputs: Dictionary mapping modality names to lists of inputs.

        Returns:
            Tuple of ``("message", {"message": [messages_per_sample, ...]})``
        r^   r-   r,   c              3  <   K   | ]  }t        |t                y wrH   )r0   r4   )rJ   vs     r   rM   z2InputFormatter.batch_to_message.<locals>.<genexpr>  s     OrghPZ[\^aPbOrs   )r0   r4   r3   r   r   valuesrk   r1   r2   allr_   rm   rn   )
rC   rz   processor_inputsr   
batch_sizer~   ir   r   r{   s
             r   batch_to_messagezInputFormatter.batch_to_messagej  s     y"o--$.x$=h[8
d#3#:#:#<=>?
z" 		:ADTjSX[_iXi3 0 5a 88jKj;1$T+"4"4"678eeT]3E
aCOrlqOrLrOOD$9$9%$@AOODOOK89		: 9h/// ks   	D#'D#c                    | D ]S  }|D ]L  }|j                  d      }t        |t              r%t        |t              rt	        d |D              sH  y  y U y)a  Check whether all messages in a batch contain only text content.

        Works with both flat format (``{"content": "hello"}``) and structured format
        (``{"content": [{"type": "text", "text": "hello"}]}``).

        Args:
            messages_batch: List of message lists, one per sample.

        Returns:
            True if every message contains only text, False if any contain non-text content.
        r/   c              3  F   K   | ]  }|j                  d d      dk7    yw)r   r   N)get)rJ   ry   s     r   rM   z7InputFormatter.is_text_only_messages.<locals>.<genexpr>  s      R$488FF3v=Rs   !FT)r   r0   r4   r2   rP   )messages_batchr~   r^   r/   s       r   is_text_only_messagesz$InputFormatter.is_text_only_messages  s`     ' 		!H# !!++i0gs+gt,R'RR$ !		! r   c                p   g }|D ]-  }d|vsd|vrt         j                  d| d       &|d   }t        |t              xr |xr t        |d   t              }| j
                  dk(  ri|rgt        |      dk(  r%d|d   v r|j                  i |d|d   d   i       t         j                  d	t        |       d
       |j                  |       | j
                  dk(  rB|s@t        |t              r|j                  i |dd|dgi       
|j                  |       |j                  |       0 |S )a;  Normalize messages to the target format (``self.message_format``).

        Extra keys beyond ``"role"`` and ``"content"`` are preserved during conversion.

        Args:
            messages: List of message dictionaries to normalize.

        Returns:
            Normalized list of message dictionaries.
        r.   r/   zInvalid message format: z. Skipping.r   r   r-   r   z;Cannot convert structured message to flat format: contains z# content items. Keeping structured.r=   r   r   )	rs   r   r0   r2   r5   rB   r3   r_   r4   )rC   r~   
normalizedr^   r/   is_currently_structureds         r   normalize_messagesz!InputFormatter.normalize_messages  sS    
 	+GW$	(@!9'+NOi(G&0$&?&lG&lPZ[bcd[egkPl#""f,1Hw<1$71:)=%%&P&P)WQZ=O&PQNN$$'L>1TV %%g.$$4=Tgs+%%&a&a)vW^>_=`&ab%%g.!!'*/	+2 r   c                    | j                   dk(  r|D cg c]  }d|dg|z    c}S |D cg c]  }dd|dgdg|z    c}S c c}w c c}w )a'  Prepend a system prompt to message format inputs.

        Args:
            messages: List of message lists (each message list represents one input).
            prompt: System prompt to prepend.

        Returns:
            Messages with system prompt prepended to each message list.
        r   systemr   r   r   )rB   )rC   r~   promptmessage_lists       r   prepend_prompt_to_messagesz)InputFormatter.prepend_prompt_to_messages  sn     &(_gh|h6:;lJhh !)
 VV,L+MNOR^^
 	
 i
s
   AAc           	         g }|D ]N  }t        |t              r|j                  ||z          (|j                  ||d   z   gt        |dd       z          P |S )ab  Prepend a prompt to text format inputs.

        For single texts, prepends the prompt directly.
        For text pairs (cross-encoder inputs), prepends only to the first text.

        Args:
            texts: List of text inputs (strings or pairs)
            prompt: Prompt to prepend

        Returns:
            Texts with prompt prepended
        r   r-   N)r0   r4   r_   r2   )rC   textsr   resultr   s        r   prepend_prompt_to_textsz&InputFormatter.prepend_prompt_to_texts  sb      	CD$$ftm,vQ/04QR>AB		C
 r   )r<   NN)r>   r4   rB   r   r@   list[Modality] | NonereturnNone)r   zLiteral['structured', 'flat'])ru   list[SingleInput | PairInput]r   zBtuple[Modality, dict[str, list], defaultdict[str, dict[str, Any]]])rS   ztuple | listr   list[dict[str, Any]])user)r   zdict[Modality, Any]r.   r4   r   r   )rz   r	   r   r5   r   z*tuple[Literal['message'], dict[str, list]])r   list[list[dict[str, Any]]]r   bool)r~   r   r   r   )r~   r   r   r4   r   r   )r   z'list[str | tuple[str, str] | list[str]]r   r4   r   zlist[str | list[str]])__name__
__module____qualname____doc__rD   rA   r   rm   rn   r   staticmethodr   r   r   r   rI   r   r   r:   r:   f   s    < )/6:11 &1
 41 
1 DtA-tA 
LtAl!
F
<0 0480	30:  0%N
2
<?
	#
&<FI	r   r:   c                   t         t        | t               ryt        t        | t              ryt        t        | t              ry| xt        d x\   t        |       r
 |d|vryy xt        d x\   t        |       r
 |d|vryy xt        d x\   t        |       r
 |d|vryy xxt        d x\     y x  r! dk(  r\  t        d x\   t        d x\     y  x  r! dk(  r\  t        d x\   t        d x\     y   xt        d x\   d| v rd| v r y	 xt        d x'\   | r$t        | d
   t              rd| d
   v r
d| d
   v r y	 xt        d x\   d| v rd| v r y xt        d x\   d| v rd| v r y xt        d x,\   d| v r' t        dt        | j                                       xt        d xZ\   | rW h d}t        | j                               |z
  }|rt        d| d|       t        t        | j                                     S  xt        d x\    t        d       xt        j                   d x\   n xt"        j$                  d x\   n  nH | j&                  dv ry| j&                  dk(  ry| j&                  dv ryt        d| j&                   d      	 t        dt)        |       j*                   d      )a  Infer the modality of a single input sample by inspecting its type/structure.

    Pure type-based detection, does not require a processor or tokenizer.

    Args:
        sample: A single input sample to inspect.
        supported_modalities: Optional list of modalities the model supports. When provided,
            string inputs that would be classified as image/video/audio based on URL/path
            heuristics are instead classified as ``"text"`` if that modality is not supported.
            This prevents misclassification of text that happens to contain media URLs.

    Returns:
        The detected modality string, or a tuple of modality strings for multimodal dict inputs.

    Raises:
        ValueError: If the input type/structure is not recognized.
    imagerU   rY   rI   r   r,   r.   r/   r^   r   rV   rW   rZ   zuDict input with 'array' key must also include 'sampling_rate' (for audio) or 'video_metadata' (for video). Got keys: >   r   rU   r   rY   z;Multimodal dict input contains unrecognized modality keys: z. Expected keys from: z-Empty dict input is not a valid input sample.)r-   r,      )      z#Unsupported tensor dimensionality: z<D. Expected 1-2D for audio, 3D for image, or 4-5D for video.zUnsupported input type: zG. Expected one of: str, dict, PIL.Image.Image, np.ndarray, torch.Tensor)PILImager0   r   r   r4   r%   r(   r*   r5   r2   
ValueErrorrp   rr   r1   sortednpndarraytorchTensorndimr   r   )r6   r@   valid_modalitiesinvalid_keyss       r   r`   r`     s   , 
68 <Jv|$DJv|$D
SU*62U#/GCW4W  SU*62U#/GCW4W  SU*62U#/GCW4W  5USU4 ^^ceSU4 $nnsuce4 '54TVv'I,?V TVv*VAY"=&FSTIBUZcgmnogpZpV TVw&(_-FV TVw&(-=-GV TVw&(V /02   TVvVBv{{}-0@@L QR^Q_ `++;*<>  .//  TVLMM RZZ\NELLN*{{f$!& 9&++ GP Q  *4<+@+@*A BX Y r   c                    | sy| D ch c]  }t        ||       }}t        |      dk(  r|j                         S dS c c}w )a  Infer the modality of a batch of input samples.

    If all samples share the same modality, that modality is returned. If the batch contains
    mixed modalities, ``"message"`` is returned, consistent with how :class:`InputFormatter`
    handles mixed-modality batches in :meth:`~InputFormatter.parse_inputs`.

    Args:
        samples: List of input samples to inspect.
        supported_modalities: Optional list of modalities the model supports. Passed through
            to :func:`infer_modality` to prevent misclassification of text as media modalities.

    Returns:
        The detected modality, or ``"message"`` for mixed-modality batches.
    r   rT   r-   r^   )r`   r3   rq   )r|   r@   r6   r   s       r   infer_batch_modalityr   M  sI    $ bijX^.>RSjJj":!3:>>BB ks   >c                H    t        | t              rdj                  |       S | S )zSFormat a modality for display, e.g. ``("text", "image")`` becomes ``"text+image"``.+)r0   r1   join)rz   s    r   format_modalityr   e  s     (E"xx!!Or   )r   r4   r   r   )r   r4   r"   ztuple[str, ...]r   r   )r6   r   r   r   rH   )r6   zSingleInput | PairInput | Anyr@   r   r   r	   )r|   r   r@   r   r   r	   )rz   r	   r   r4   )*r   
__future__r   loggingr    collectionsr   typingr   r   urllib.parser   rd   r   r   )sentence_transformers.base.modality_typesr   r	   r
   r   	PIL.Imager   r   ImportErrortorchcodec.decodersr   r   OSError	getLoggerr   rs   rN   r   r#   r%   r(   r*   r8   r:   r`   r   r   rI   r   r   <module>r      s   G "  	 #  !   +>
 
		8	$
 	% !H
Fd	R
*I I\ 37X)X/X Xz 37C*C/C C0]  H
 	W LLs#   B B, B)(B),B:9B: