
    i.                        d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	m
Z
mZ d dlZddlmZ ddlmZmZ ddlmZmZ ddlmZ d	d
lmZ d	dlmZ erddlmZ dZej<                  j?                  d      	 	 	 d(de
e   de de!de!de
e   dedgee   f   fd       Z"ej<                  j?                  d      	 	 	 d)de
ee#ef      de!de!de!dedgee   f   f
d       Z$ej<                  j?                  d      dddee#ef   de fd       Z%ej<                  j?                  d      	 	 d*de
e   de!de!dedgee   f   fd        Z&dee#ef   de	e   fd!Z' G d" d#      Z( G d$ d%      Z) G d& d'      Z*y)+    N)Path)TYPE_CHECKINGCallableIterableIteratorListOptionalUnion   )util)ErrorsWarnings)DocDocBin)Vocab   )dont_augment)Example)Languagez.spacyzspacy.Corpus.v1pathgold_preproc
max_lengthlimit	augmenterreturnr   c                     | t        t        j                        t        j                  j                  d|        t        | ||||      S )NzLoading corpus from path: %s)r   r   r   r   )
ValueErrorr   E913r   loggerdebugCorpus)r   r   r   r   r   s        f/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/spacy/training/corpus.pycreate_docbin_readerr#      sI     |%%KK4d;!     zspacy.JsonlCorpus.v1
min_lengthc                      t        | |||      S )N)r%   r   r   )JsonlCorpus)r   r%   r   r   s       r"   create_jsonl_readerr(   *   s     t
zQVWWr$   zspacy.read_labels.v1F)requirer)   c                h    t        |       } |s| j                         sy t        j                  |       S N)r   existssrsly	read_json)r   r)   s     r"   read_labelsr/   4   s*     :D4;;=??4  r$   zspacy.PlainTextCorpus.v1c                 T    | t        t        j                        t        | ||      S )  Iterate Example objects from a file or directory of plain text
    UTF-8 files with one line per doc.

    path (Path): The directory or filename to read from.
    min_length (int): Minimum document length (in tokens). Shorter documents
        will be skipped. Defaults to 0, which indicates no limit.
    max_length (int): Maximum document length (in tokens). Longer documents will
        be skipped. Defaults to 0, which indicates no limit.

    DOCS: https://spacy.io/api/corpus#plaintextcorpus
    r%   r   )r   r   r   PlainTextCorpus)r   r%   r   s      r"   create_plain_text_readerr4   >   s'    " |%%4J:NNr$   c                    t        j                  |       } | j                         s!| j                  d   j	                  |      r| gS | }| g}g }t               }|D ]  } t        |       |v r|j                  t        |              | j                  r| j                  d   j                  d      rV| j                         r |j                  | j                                | j                  d   j	                  |      s|j                  |         t        |      dk(  r4t        j                  t        j                   j#                  ||             |j%                          |S )N.r   )r   format)r   ensure_pathis_dirpartsendswithsetstradd
startswithextenditerdirappendlenwarningswarnr   W090r8   sort)r   	file_type	orig_pathpathslocsseens         r"   walk_corpusrN   T   s   D!D;;=TZZ^44Y?vIFED5D 	t9T::$**R.33C8[[]LL(ZZ^$$Y/KK	 4yA~hmm**	)*LMIIKKr$   c                       e Zd ZdZdddddddeeef   deded	ed
e	e
   deddfdZdddee   fdZdddededefdZdddee   dee   fdZdddee   dee   fdZdedeeeef      dee   fdZy)r!   a6  Iterate Example objects from a file or directory of DocBin (.spacy)
    formatted data files.

    path (Path): The directory or filename to read from.
    gold_preproc (bool): Whether to set up the Example object with gold-standard
        sentences and tokens for the predictions. Gold preprocessing helps
        the annotations align to the tokenization, and may result in sequences
        of more consistent length. However, it may reduce run-time accuracy due
        to train/test skew. Defaults to False.
    max_length (int): Maximum document length. Longer documents will be
        split into sentences, if sentence boundaries are available. Defaults to
        0, which indicates no limit.
    limit (int): Limit corpus to a subset of examples, e.g. for debugging.
        Defaults to 0, which indicates no limit.
    augment (Callable[Example, Iterable[Example]]): Optional data augmentation
        function, to extrapolate additional examples from your annotations.
    shuffle (bool): Whether to shuffle the examples.

    DOCS: https://spacy.io/api/corpus
    r   FN)r   r   r   r   shuffler   r   r   r   r   rP   r   c                    t        j                  |      | _        || _        || _        || _        ||nt        | _        || _        y r+   )	r   r9   r   r   r   r   r   r   rP   )selfr   r   r   r   r   rP   s          r"   __init__zCorpus.__init__   sA     $$T*	($
&/&;r$   nlpr   c              #   r  K   | j                  |j                  t        | j                  t                    }| j
                  r t        |      }t        j
                  |       | j                  r| j                  ||      }n| j                  ||      }|D ]  }| j                  ||      D ]  }|   yw)zYield examples from the data.

        nlp (Language): The current nlp object.
        YIELDS (Example): The examples.

        DOCS: https://spacy.io/api/corpus#call
        N)read_docbinvocabrN   r   	FILE_TYPErP   listrandomr   make_examples_gold_preprocmake_examplesr   )rR   rT   ref_docsexamplesreal_egaugmented_egs         r"   __call__zCorpus.__call__   s      ##CII{499i/PQ<<H~HNN8$66sHEH))#x8H 	#G $sG < #""#	#s   B5B7	referencec                 2   |s|j                   r[t        t        |j                  |D cg c]  }|j                   c}|D cg c]  }t        |j                         c}      |      S t        |j                  |j                        |      S c c}w c c}w )Nwordsspaces)has_unknown_spacesr   r   rW   textboolwhitespace_make_doc)rR   rT   rb   r   words        r"   _make_examplezCorpus._make_example   s     977II1:;499;?HItD!1!12I
   3<<	7CC <Is   BBreference_docsc              #     K   |D ]  }t        |      dk(  r| j                  dk(  st        |      | j                  k  r| j                  ||d       O|j                  d      sa|j                  D ]\  }t        |      dk(  r| j                  dk(  st        |      | j                  k  s:| j                  ||j                         d       ^  y w)Nr   F
SENT_START)rD   r   rm   has_annotationsentsas_doc)rR   rT   rn   rb   ref_sents        r"   r\   zCorpus.make_examples   s      ( 
	PI9~"A%Y$//)I((i??)),7 ) PH8}) A-X1P"00hoo6GOO	P
	Ps   A#C&AC,(Cc              #     K   |D ]r  }|j                  d      r(|j                  D cg c]  }|j                          }}n|g}|D ]/  }| j                  ||d      }t	        |j
                        s,| 1 t y c c}w w)Nrp   T)rq   rr   rs   rm   rD   x)rR   rT   rn   rb   sent	ref_sentsrt   egs           r"   r[   z!Corpus.make_examples_gold_preproc   s      ( 	I''57@GtT[[]G	G&K	% ''Xt<rtt9H	Gs   %B A;3B 2B rW   rL   c              #   \  K   d}|D ]  }t        j                  |      }|j                  d   j                  t              s;t               j                  |      }|j                  |      }|D ]8  }t        |      s| |dz  }| j                  dk\  s(|| j                  k\  s8   yw)z(Yield training examples as example dictsr   r6   r   N)
r   r9   r;   r<   rX   r   	from_diskget_docsrD   r   )rR   rW   rL   ilocdoc_bindocsdocs           r"   rV   zCorpus.read_docbin   s       
	"C""3'Cyy}%%i0 (,,S1''. "C3x!	Q::?qDJJ!"
	"s   ?B,:B,=B,B,&B,)__name__
__module____qualname____doc__r
   r>   r   intri   r	   r   rS   r   r   ra   r   rm   r   r\   r[   r   rV    r$   r"   r!   r!   m   s0   2 "(,CI 	
   H%  
"#J #8G+< #*DD*-D=AD	DPP/7}P	'	P/7}	'	"""*5d+;"<"	#"r$   r!   c                   `    e Zd ZdZdZdddddeeeef      de	de	de	d	d
f
dZ
ddd	ee   fdZy
)r'   ac  Iterate Example objects from a file or directory of jsonl
    formatted raw text files.

    path (Path): The directory or filename to read from.
    min_length (int): Minimum document length (in tokens). Shorter documents
        will be skipped. Defaults to 0, which indicates no limit.

    max_length (int): Maximum document length (in tokens). Longer documents will
        be skipped. Defaults to 0, which indicates no limit.
    limit (int): Limit corpus to a subset of examples, e.g. for debugging.
        Defaults to 0, which indicates no limit.

    DOCS: https://spacy.io/api/corpus#jsonlcorpus
    jsonlr   )r   r%   r   r   r   r%   r   r   Nc                b    t        j                  |      | _        || _        || _        || _        y r+   )r   r9   r   r%   r   r   )rR   r   r   r%   r   s        r"   rS   zJsonlCorpus.__init__   s+     $$T*	$$
r$   rT   r   c           
   #     K   t        | j                  d      D ]  }t        j                  |      }|D ]  }|j	                  |d         }| j
                  dk\  rt        |      | j
                  k  r?| j                  dk\  rt        |      | j                  k\  rg|D cg c]  }|j                   }}|D cg c]  }t        |j                         }}t        |t        |j                  ||               yc c}w c c}w w)zYield examples from the data.

        nlp (Language): The current nlp object.
        YIELDS (Example): The example objects.

        DOCS: https://spacy.io/api/corpus#jsonlcorpus-call
        z.jsonlrh   r   rd   N)rN   r   r-   
read_jsonlrk   r%   rD   r   rh   ri   rj   r   r   rW   )	rR   rT   r~   recordsrecordr   wre   rf   s	            r"   ra   zJsonlCorpus.__call__  s      tyy(3 	SC&&s+G! Sll6&>2??a'CHt,F__)c#h$//.I-01QVV1E1;>?ad1==1?F? "#s399E&'QRRS	S 2?s   BDC>1D7D5Dr   r   r   r   rI   r	   r
   r>   r   r   rS   r   r   ra   r   r$   r"   r'   r'      sw     I uS$Y'( 	
   
SJ S8G+< Sr$   r'   c            	       Z    e Zd ZdZdZddddeeeef      de	de	dd	fd
Z
dddee   fdZy	)r3   r1   txtr   r2   r   r%   r   r   Nc                T    t        j                  |      | _        || _        || _        y r+   )r   r9   r   r%   r   )rR   r   r%   r   s       r"   rS   zPlainTextCorpus.__init__,  s$     $$T*	$$r$   rT   r   c              #     K   t        | j                  d      D ]  }t        |d      5 }|D ]  }|j                  d      }t	        |      s |j                  |      }| j                  dk\  rt	        |      | j                  k  rY| j                  dk\  rt	        |      | j                  kD  rt        ||j                                 	 ddd        y# 1 sw Y   xY ww)zYield examples from the data.

        nlp (Language): The current nlp object.
        YIELDS (Example): The example objects.

        DOCS: https://spacy.io/api/corpus#plaintextcorpus-call
        z.txtzutf-8)encodingz
r   N)
rN   r   openrstriprD   rk   r%   r   r   copy)rR   rT   r~   frh   r   s         r"   ra   zPlainTextCorpus.__call__7  s      tyy&1 	7CcG, 7 
7D;;v.D4y!ll40??a/CHt4N$!__1c#h6P$ &c388:66
77 7	77 7s#   'C#"CA?CC#C 	C#r   r   r$   r"   r3   r3     sg    
 I 	%uS$Y'(	% 		%
 	% 
	%7J 78G+< 7r$   r3   )r   r   N)r   r   r   )r   r   )+rZ   rE   pathlibr   typingr   r   r   r   r   r	   r
   r-    r   errorsr   r   tokensr   r   rW   r   augmentr   exampler   languager   rX   registryreadersri   r   r#   r>   r(   r/   r4   rN   r!   r'   r3   r   r$   r"   <module>r      s(      U U U   %    ! #	 () $(
4.  	
 ! zlHW--. *& -. 	X
5d#
$XX X 	X
 zlHW--.X /X -.;@ !eCI& !D ! /! 12 O
4.OO O zlHW--.	O 3O*eCI& d4j 2v" v"r4S 4Sn.7 .7r$   