
    is#                        d dl Z d dlmZmZmZmZmZmZmZ ddl	m
Z
mZ ddlmZmZ dee   dee   fdZdee   dee   fd	Zdee   dee   fd
Zdee   dee   fdZddedefdZdedee   fdZ	 ddedeeeeeeef   f      dedee   fdZdedee   dee   fdZdedee   deeeeeeef   f      fdZdee   deeeeef      fdZdedeeef   fdZdedefdZeZeZeZy)    N)DictIterableIteratorListTupleUnioncast   )ErrorsWarnings)DocSpantagsreturnc                     g }t        |       } | r7|j                  t        |              |j                  t        |              | r7|S )N)listextend_consume_os_consume_ent)r   outs     i/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/spacy/training/iob_utils.pyiob_to_biluor      sA    C:D


;t$%

<%&  J    c                     g }| D ]K  }||j                  |       |j                  ddd      j                  ddd      }|j                  |       M |S )NU-B-   L-I-)appendreplace)r   r   tags      r   biluo_to_iobr#      sY    
C ;JJsO++dD!,44T4CCJJsO Jr   c              #   b   K   | r)| d   dk(  r | j                  d       | r
| d   dk(  ry y y y w)Nr   O)pop)r   s    r   r   r      s6     
47c>hhqk 47c>$>$s   (//c                    | sg S | j                  d      }d|dd  z   }d|dd  z   }d}| r+| d   ||hv r"|dz  }| j                  d       | r
| d   ||hv r"|dd  }|dk(  r=t        |      dk(  r)t        t        j                  j                  |            d|z   gS d|z   }d	|z   }t        d|dz
        D cg c]  }d
| 	 }	}|g|	z   |gz   S c c}w )Nr   Ir   Lr
   r"   r   r   r   r   )r&   len
ValueErrorr   E177formatrange)
r   r"   	target_intarget_lastlengthlabelstartend_middles
             r   r   r   !   s   	
((1+Cc!"gIAB-KF
47y+66! 47y+66 GE{u:?V[[//C/899u~uUl(-a!(<=1Bug,==w3%'' >s   7Cdocmissingc           	          t        | | j                  D cg c]%  }|j                  |j                  |j                  f' c}|      S c c}w )Nr9   )offsets_to_biluo_tagsents
start_charend_charlabel_)r8   r9   ents      r   doc_to_biluo_tagsrB   7   s=     ?BxxH#..#,,

	3H Hs   *A
c                 p    t        | d      }t        |       D ]  \  }}|j                  dk(  sd||<    |S )N-r;   r
   r%   )rB   	enumerateent_iob)r8   r=   itokens       r   _doc_to_biluo_tags_with_partialrI   ?   sA    S#.DcN 5==ADG Kr   entitiesc           
      
   i }| D ci c]  }|j                   |j                   }}| D ci c]%  }|j                   t        |      z   |j                  ' }}| D cg c]  }d }}|D ]  \  }	}
}|s|D ]  }||	k\  s	||
k  sd|||   <    %t        |	|
      D ]^  }||j	                         v rBt        t        j                  j                  ||   d   ||   d   ||   d   f|	|
|f            |	|
|f||<   ` |j                  |	      }|j                  |
      }||||k(  r	d| ||<   d	| ||<   t        |dz   |      D ]
  }d
| ||<    d| ||<    t               }|D ](  \  }	}
}t        |	|
      D ]  }|j                  |        * | D ]H  }t        |j                   |j                   t        |      z         D ]  }||v s 9 |||j                  <   J d|v r|dk7  rt        |      }t        j                  t        j                   j                  t        | j"                        dkD  r| j"                  dd dz   n| j"                  t        |      dkD  r|dd dz   n|             |S c c}w c c}w c c}w )u  Encode labelled spans into per-token tags, using the
    Begin/In/Last/Unit/Out scheme (BILUO).

    doc (Doc): The document that the entity offsets refer to. The output tags
        will refer to the token boundaries within the document.
    entities (iterable): A sequence of `(start, end, label)` triples. `start`
        and `end` should be character-offset integers denoting the slice into
        the original string.
    missing (str): The label used for missing values, e.g. if tokenization
        doesn’t align with the entity offsets. Defaults to "O".
    RETURNS (list): A list of unicode strings, describing the tags. Each tag
        string will be of the form either "", "O" or "{action}-{label}", where
        action is one of "B", "I", "L", "U". The missing label is used where the
        entity offsets don't align with the tokenization in the `Doc` object.
        The training algorithm will view these as missing values. "O" denotes a
        non-entity token. "B" denotes the beginning of a multi-token entity,
        "I" the inside of an entity of three or more tokens, and "L" the end
        of an entity of two or more tokens. "U" denotes a single-token entity.

    EXAMPLE:
        >>> text = 'I like London.'
        >>> entities = [(len('I like '), len('I like London'), 'LOC')]
        >>> doc = nlp.tokenizer(text)
        >>> tags = offsets_to_biluo_tags(doc, entities)
        >>> assert tags == ["O", "O", 'U-LOC', "O"]
    rD   r%   r   r   r
   )span1span2Nr   r   r   r   2   z...)textrJ   )idxrG   r+   r/   keysr,   r   E103r.   getsetaddstrwarningswarnr   W030rO   )r8   rJ   r9   tokens_in_entsrH   startsendsr6   biluor>   r?   r3   stoken_indexstart_token	end_tokenrG   entity_charsent_strs                      r   r<   r<   G   s   < CEN.12Ueii 2F29<=EIIE
"EGG+=D=QSE'/ 4#
He +
?q8|'*E&)$+  %Z: L."5"5"77$** .{ ;A > .{ ;A > .{ ;A >#
 $.x"? + 	 	 0:8U.K{+L !**Z0K*I&9+@)++-eWE+&+-eWE+&";?I> 0%'w<a0)+E7|E)$;4> 5L'/  #
Hez8, 	 AQ	    %uyy%))c%j"89 	%AL 	% %E%''N% e|3h-MM  .1#((mb.@SXXcr]U*chh14W1B"- ! 	
 Lk 3=s   I6*I;	J c                 |    t        |      }g }|D ])  \  }}}t        | ||dz   |      }|j                  |       + |S )a  Encode per-token tags following the BILUO scheme into Span object, e.g.
    to overwrite the doc.ents.

    doc (Doc): The document that the BILUO tags refer to.
    tags (iterable): A sequence of BILUO tags with each tag describing one
        token. Each tag string will be of the form of either "", "O" or
        "{action}-{label}", where action is one of "B", "I", "L", "U".
    RETURNS (list): A sequence of Span objects. Each token with a missing IOB
        tag is returned as a Span with an empty label.
    r   r3   )tags_to_entitiesr   r    )r8   r   token_offsetsspansr3   	start_idxend_idxspans           r   biluo_tags_to_spansrl      sP     %T*ME%2 !y'CGaKu=T Lr   c                     t        | |      }|D cg c]%  }|j                  |j                  |j                  f' c}S c c}w )a  Encode per-token tags following the BILUO scheme into entity offsets.

    doc (Doc): The document that the BILUO tags refer to.
    tags (iterable): A sequence of BILUO tags with each tag describing one
        token. Each tags string will be of the form of either "", "O" or
        "{action}-{label}", where action is one of "B", "I", "L", "U".
    RETURNS (list): A sequence of `(start, end, label)` triples. `start` and
        `end` will be character-offset integers denoting the slice into the
        original string.
    )rl   r>   r?   r@   )r8   r   rh   rk   s       r   biluo_tags_to_offsetsrn      s7      T*EFKLdT__dmmT[[9LLLs   *>c           
         g }d}t        |       D ]]  \  }}||j                  d      r|d}|j                  d||f       4|j                  d      rF|j                  d      r<|Zt        t        j
                  j                  dt        |       d|dz                |j                  d      r|j                  |d	d ||f       |j                  d
      r|}|j                  d      rV|9t        t        j
                  j                  dt        |       d|dz                |j                  |d	d ||f       d}7t        t        j                  j                  |             |S )zxNote that the end index returned by this function is inclusive.
    To use it for Span creation, increment the end by 1.NrD    r%   r(   r   )r4   r   Ur
   Br)   r*   )	rE   
startswithr    r,   r   E067r.   r   E068)r   rJ   r4   rG   r"   s        r   rf   rf      sd    HED/ :3;#..- Q
+^^C ^^C } KK&&StDz'AE7J&K  ^^C OOSWaO,^^C E^^C } KK&&StDz'AE7J&K  OOSWeQ/0EV[[//C/8995:6 Or   r3   c                 \    t        t        t        t        f   | j                  dd            S NrD   r   )r	   r   rV   splitre   s    r   split_bilu_labelry      s"    c3hS!!455r   c                 ,    | j                  dd      d   S rw   )rx   re   s    r   remove_bilu_prefixr{      s    ;;sAq!!r   )r%   ) rW   typingr   r   r   r   r   r   r	   errorsr   r   tokensr   r   rV   r   r#   r   r   rB   rI   intr<   rl   rn   rf   ry   r{   offsets_from_biluo_tagsspans_from_biluo_tagsbiluo_tags_from_offsets r   r   <module>r      s    E E E % x} c x} c d3i HSM 
(tCy (T#Y (,3   c  TWT	T sCsCx'@!ABTMPT	#YTnS  $t* &M	MSMM	%S%S/)
*+M" 8C=  T%S#2F-G  F6C 6E#s(O 6"c "c "
 0 + / r   