
    i                     >   d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	m
Z
mZmZmZmZmZmZmZ d dlZd dlZd dlZd dlmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1 ddlm2Z2m3Z3 ddl4m5Z6 ddl7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z? dZ@dZAdZBdZCdZDdZEdZF e<j                  dddd       e;j                  ddddd        e8d!d"dd#       e9dd$d%d&d'(       e9d)d*d+d,(       e9d)d-d.d/(       e9d)d0d1d2(      fd3ej                  d4ed5ee   d6eId7eId8eIfd9              ZJi d)d)ddd:d4ed;eeKef   d6eId7eId8eId<eIfd=ZLd>ed?ed@dfdAZMdBee.   dCe
eK   dDedEeId@eeKef   f
dFZNedldGe	eK   dHed)   d@eKfdI       ZOedGe	eeKePf      dHed   d@eKfdJ       ZO	 dldGee	eK   e	eeKePf      f   dHeId@eKfdKZOdmdLeePeQf   dMeId@eKfdNZR	 	 dndee.   dOeKdPedQ   dReeK   d@ePf
dSZSdDedTeKd@eeK   fdUZTdDed@eeKeeK   f   fdVZUdWe
d@eQfdXZVdYeeKeQf   dZed@eQfd[ZWdmd\eId@efd]ZXd^ed_ed@eQfd`ZYdae
e   dGe
eK   d@e
e   fdbZZdBe
e.   dceeKef   dReKd@eeKef   fddZ[deeeKef   fdfZ\eFfdged@eePeQf   fdhZ]dieePeQf   djePd@eePeQf   fdkZ^y)o    N)Counter)Path)AnyDictIterableListOptionalSequenceSetTupleUnioncastoverload)MESSAGESPrintermsg   )util)Literal)Language)
Morphology)MorphologizerSpanCategorizerTrainablePipe)	EditTrees)nonproj)	DELIMITER)ConfigSchemaTraining)Exampleremove_bilu_prefix)get_sourced_components)registryresolve_dot_names)Mode   )ArgOpt_format_numberapp	debug_cliimport_codeparse_config_overridesshow_validation_error2      d   i  Z   dataT)allow_extra_argsignore_unknown_options)context_settings
debug-data)r5   hidden.zPath to config file)helpexists
allow_dashz--code-pathz--codez-czNPath to Python file with additional code (registered functions) to be imported)r8   Fz--ignore-warningsz-IWz+Ignore warnings, only show stats and errorsz	--verbosez-Vz-Print additional information and explanationsz--no-formatz-NFzDon't pretty-print the resultsctxconfig_path	code_pathignore_warningsverbose	no_formatc                     | j                   j                  dk(  rt        j                  d       t	        | j
                        }t        |       t        |||||d       y)a  
    Analyze, debug and validate your training and development data. Outputs
    useful stats, and can help you find problems like invalid entity annotations,
    cyclic dependencies, low data labels and more.

    DOCS: https://spacy.io/api/cli#debug-data
    r6   zThe debug-data command is now available via the 'debug data' subcommand (without the hyphen). You can run python -m spacy debug --help for an overview of the other available debugging commands.Fconfig_overridesr>   r?   r@   silentN)commandnamer   warnr,   argsr+   
debug_data)r;   r<   r=   r>   r?   r@   	overridess          e/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/spacy/cli/debug_data.pydebug_data_clirL   ?   s[    T {{<'P	

 'sxx0I	"'    rB   rC   rD   c                p/  NO t        || |      }t        |       5  t        j                  | |      }t        j                  |      NNj
                  j                         }t        j                  |d   t              }	d d d        t              }
	d   }|
D cg c]	  }||vs| }}Nj                  }Nj                  D cg c]  }Nj                  |      j                   }} |j                  d       |	d   |	d   g}t        |      \  O}Nj!                  NOfd	        |j"                  d
       t%         ON            }t%         |N            } |j"                  d       t'        ||Nd      }t'        ||Nd      }t'        ||Nd      }|d   }|d   }|	d   } |j                  d        |j(                  dNj*                           |j(                  ddj-                  |              |r$ |j(                  ddj-                  |              |r$ |j(                  ddj-                  |               |j(                  t/        |       d        |j(                  t/        |       d       t/        |      s |j0                  d       t/        |j3                  |            }|r |j4                  | d       n |j"                  d       |s{t/        |      t6        k  ridt/        |       d}t/        |      t8        k  r |j0                  |       n |j4                  |        |j(                  dt6         dt8         d|        |j                  d        |d!   } |j:                  | d"t/        |d#          d$       |d%   d&kD  r|d%   } |j4                  | d'       |d%   d&kD  r|d%   } |j4                  | d(       |d#   j=                  d)      } |j(                  d*t?        |d+       |       t/        Nj@                  jB                        rNj@                  jB                  jD                  tF        jH                  k(  r |j:                  d,t/        Nj@                  jB                         d-Nj@                  jJ                   d.Nj@                  jB                  jL                   d/Nj@                  jB                  jN                   d0	       n |j:                  t/        Nj@                  jB                         d1Nj@                  jB                  jP                   d2Nj@                  jJ                   d3       tS        |d4   jU                               } |j4                  d5jW                  |d6||d!   z  z                |j(                  d7jW                  t?        |d4   j=                  d)      d+            |       n |j:                  d8       d9|v sd:|v r#tY        N      } d}!d}" |j                  d;        |jZ                  | d<d=gd>        |j(                  d?|       |d9   j]                         D ]8  \  }#}$ |j(                  d@|# dt?        |$j]                         d+       |       : | j_                         D #ci c]  }#|#|d9   |#    }%}#|%j]                         D ]  \  }#}$|$j]                         D ]  \  }&}'|#| j_                         v }(|(r |&| |#   vr |j4                  dA|& dB|# dC       |'t`        k  r |j4                  dD|& dE|# dF|' d       d}! |jb                  dG      5  te        ||&d9|#      })d d d        )d&k(  s |j4                  dH|& dI       d}"  |jb                  dJ      5  tg        |||#      }*d d d         |j:                  dK|# dI        |j:                  dL       ti        *       tk        |dM   |#         }+tm        |+tn        N      }, |j:                  dOtn         dPtq        |,j_                                dQ|*dR    dS|*dT    dUts        |,       dV        |j(                  dWts        |+       |       |*dX   tt        k  r |j4                  dY       n |j"                  dZ       |*d[   jU                         }-tS        |-tw                     }.|.j=                  d)      D /0cg c]  \  }/}0|/	 }1}/}0 |j(                  d\jW                  t?        |1            |       |*d]   tx        k  r |j4                  d^       n |j"                  d_       |*d`   jU                         }2tS        |2tw                     }3|3j=                  d)      D /0cg c]  \  }/}0|/	 }4}/}0 |j(                  dajW                  t?        |4            |        |!r |j(                  dbt`         dc|       n |j"                  dd       |"r |j(                  de|       n |j"                  df       dg|v rat{        dh |dg   D              }5|dg   }6t}        Ndg      }7d}!d}"d}8d}9 |j                  di        |j:                  t/        |7       dj       |6d/   }: |j(                  |: dk       |5D ]#  }&t/        |&      d&k(  s |j0                  dl       % |6j=                         D &'cg c]  \  }&}'|&d/k7  r|&|'f };}&}'t?        |;d+      }; |j(                  dm|; |       |7|5z
  }<|<r |j4                  dnt?        |<       do       |dp   r |j0                  |dp    dq       d}8|5D ]s  }&|6|&   t`        k  s |j4                  dD|& dF|6|&    d       d}! |jb                  dG      5  te        ||&dg      })d d d        )d&k(  s\ |j4                  dH|& dI       d}"u |dr   r |j4                  |dr    ds       d}9|!s |j"                  dd       |"s |j"                  df       |8s |j"                  dt       |9s |j"                  du       |!r |j(                  dvt`         dc|       |"r |j(                  dw|       |8r |j(                  dx       dy|v rS |j                  dz       t}        Ndy      }5 |j:                  d{t/        |5       dj        |j(                  d|t?        |5       |       |5t{        |d}         z
  }<|<r |j4                  dnt?        |<       do       t{        |d}         t{        |d}         k7  r1 |j4                  d~t?        |d}          dt?        |d}          do       t/        |5      dk  r |j0                  d       |d   d&kD  s|d   d&kD  r |j0                  d       |d   d&kD  r |j0                  d       |d   d&kD  r |j0                  d       d|v rN |j                  d       t}        Nd      }5 |j:                  d{t/        |5       dj        |j(                  d|t?        |5       |       |5t{        |d}         z
  }<|<r |j4                  dnt?        |<       do       t{        |d}         t{        |d}         k7  r1 |j4                  d~t?        |d}          dt?        |d}          do       |d   d&kD  s|d   d&kD  r |j0                  d       |d   d&kD  r|d   d&k(  r? |j4                  d       n, |j4                  d       |d   d&kD  r |j0                  d       d|v r> |j                  d       t        |d   j]                          \  }=}> |j:                  t/        |=       d       t        j                  |>      }||jS                         z  }| t        j                  |      z  jS                         t        j                  t/        |=            z  }? |j:                  |? d       t}        Nd      }7t{        |=      }5|7|5z
  }<|<r |j4                  dnt?        |<       do       t?        |d   j=                         d+      }; |j(                  |;|       d|v r |j                  d       |d   D &cg c]  }&|& }=}&t}        Nd      }7 |j:                  t/        |=       d       t{        |=      }5|7|5z
  }<|<r |j4                  dnt?        |<       do       t?        |d   j=                         d+      }; |j(                  |;|       d|v r\d}! |j                  d        |j:                  d|d    d|d!   |d   z  dd       |d   t/        |d         z  }@|@dk  r |j4                  d@dd       |d   D &cg c]  }&|& }A}&|d   D &cg c]  }&|& }B}&|d   D &cg c]  }&|& }C}&|d   d&kD  r|d   }D |j:                  d|D d       |d   d&kD  r|d   }D |j:                  d|D d        |j:                  t/        B       d        |j:                  t/        A       d       t?        |d   j=                         d+      }; |j(                  |;|       |d   D ]3  }&|d   |&   t        k  s |j4                  dD|& dF|d   |&    d       d}!5 g }E|d   D ]7  }&|d   |&   t        k  st        |&v sEj                  |& d|d   |&           9 t/        E      d&kD  rE |j4                  dt/        E       d        |j4                  ddj-                  |E      |       d}!t{        A      t{        C      z
  r9 |j4                  ddj-                  t{        A      t{        C      z
        |       t{        C      t{        A      z
  r9 |j4                  ddj-                  t{        C      t{        A      z
        |       |!r |j(                  dt         d|       t/        |d         dkD  r( |j4                  ddj-                  |d          d       |d   d&kD  r |j0                  d|d    d       |d   d&kD  r |j0                  d|d    d       d|v r |j                  d       |d   }F|d   }G |j:                  t/        |F       d        |j:                  t/        |G       d       |G|Fz
  }Ht/        |H      d&k7  r=t/        H      t/        G      z  }I |j:                  t/        |H       d|Id6z  dd       n |j:                  d       |d   d&kD  r|d   }J |j4                  |J d       |d   d&kD  r|d   }J |j4                  |J d       |d   d&kD  r|d   }J |j4                  |J d       n |j"                  d       |d   d&kD  r|d   }J |j4                  |J d       n |j"                  d«       |d   d&kD  r|d   }J |j:                  |J dĝ       n |j"                  dū       |d   d&kD  r|d   }J |j:                  |J dƝ       n |j"                  dǫ        |j                  dȫ       |j                  t        j                     }K|j                  t        j                     }L|j                  t        j                     }M|Kr |j"                  K d|Kdk(  rdnd˛ d̝       Lr |j4                  L d|Ldk(  rdndΛ        Mr4 |j0                  M d|Mdk(  rdndЛ        t        j                  d       y y # 1 sw Y   xY wc c}w c c}w c c}#w # 1 sw Y   xY w# 1 sw Y   xY wc c}0}/w c c}0}/w c c}'}&w # 1 sw Y   xY wc c}&w c c}&w c c}&w c c}&w )N)no_printprettyr>   )rJ   training)schemafrozen_componentszData file validationtrain_corpus
dev_corpusc                              S N )nlprT   s   rK   <lambda>zdebug_data.<locals>.<lambda>   s    <, rM   z%Pipeline can be initialized with datazCorpus is loadableT)	make_projFtextszTraining statsz
Language: zTraining pipeline: , z!Components from other pipelines: zFrozen components: z training docsz evaluation docszNo evaluation docsz* training examples also in evaluation dataz/No overlap between training and evaluation dataz0Low number of examples to train a new pipeline ()z!It's recommended to use at least z examples (minimum )showzVocab & Vectorsn_wordsz total word(s) in the data (wordsz unique)n_misaligned_wordsr   z' misaligned tokens in the training dataz" misaligned tokens in the dev data
   z10 most common words: )countszfloret vectors with z
 vectors, z dimensions, -z char n-gram subwordsz
 vectors (z unique keys, z dimensions)words_missing_vectorsz3{} words in training data without vectors ({:.0f}%)r0   z(10 most common words without vectors: {}z&No word vectors present in the packagespancatspancat_singlelabelzSpan Categorizationz	Spans KeyLabels)headerdividerzLabel counts in train data: zKey: zLabel 'z-' is not present in the model labels of key 'z*'. Performance may degrade after training.z"Low number of examples for label 'z
' in key '' (zAnalyzing label distribution...z)No examples for texts WITHOUT new label ''z!Obtaining span characteristics...z$Span characteristics for spans_key 'z8SD = Span Distinctiveness, BD = Boundary Distinctivenessspans_length)	thresholdzOver z % of spans have lengths of 1 -- z (min=
min_lengthz, max=
max_lengthz%). The most common span lengths are: z. If you are using the n-gram suggester, note that omitting infrequent n-gram lengths can greatly improve speed and memory usage.z#Full distribution of span lengths: avg_sdz5Spans may not be distinct from the rest of the corpusz.Spans are distinct from the rest of the corpusp_spansz10 most common span tokens: {}avg_bdz<Boundary tokens are not distinct from the rest of the corpusz8Boundary tokens are distinct from the rest of the corpusp_boundsz'10 most common span boundary tokens: {}z<To train a new span type, your data should include at least z instances of the new labelz&Good amount of examples for all labelszpTraining data should always include examples of spans in context, as well as examples without a given span type.z5Examples without occurrences available for all labelsnerc              3   *   K   | ]  }|d vs|  yw)Ore   NNrX   .0labels     rK   	<genexpr>zdebug_data.<locals>.<genexpr>{  s      
FV9VE
s   	zNamed Entity Recognitionz	 label(s)z) missing value(s) (tokens with '-' label)zEmpty label found in train datazLabels in train data: z|Some model labels are not present in the train data. The model performance may be degraded for these labels after training: .ws_entsz  invalid whitespace entity spansboundary_cross_entsz, entity span(s) crossing sentence boundariesz<No entities consisting of or starting/ending with whitespacez(No entities crossing sentence boundariesz>To train a new entity type, your data should include at least zuTraining data should always include examples of entities in context, as well as examples without a given entity type.z`Entity spans consisting of or starting/ending with whitespace characters are considered invalid.textcatz'Text Classification (Exclusive Classes)zText Classification: zLabels: catszWPotential train/dev mismatch: the train and dev labels are not the same. Train labels: z. Dev labels: r   zThe model does not have enough labels. 'textcat' requires at least two labels due to mutually-exclusive classes, e.g. LABEL/NOT_LABEL or POSITIVE/NEGATIVE for a binary classification task.n_cats_bad_valueszMUnsupported values for cats: the supported values are 1.0/True and 0.0/False.n_cats_multilabelzThe train data contains instances without mutually-exclusive classes. Use the component 'textcat_multilabel' instead of 'textcat'.zThe dev data contains instances without mutually-exclusive classes. Use the component 'textcat_multilabel' instead of 'textcat'.textcat_multilabelz Text Classification (Multilabel)zPotential train/dev mismatch: the train data contains instances without mutually-exclusive classes while the dev data contains only instances with mutually-exclusive classes.zThe train data contains only instances with mutually-exclusive classes. You can potentially use the component 'textcat' instead of 'textcat_multilabel'.zTrain/dev mismatch: the dev data contains instances without mutually-exclusive classes while the train data contains only instances with mutually-exclusive classes.taggerzPart-of-speech Taggingtagsz label(s) in train dataz  is the normalised label entropymorphologizerzMorphologizer (POS+Morph)morphsparserzDependency ParsingzFound n_sentsz' sentence(s) with an average length of z.1fz words.g?zThe training data contains z.2fz sentences per document. When there are very few documents containing more than one sentence, the parser will not learn how to segment longer texts into sentences.deps	n_nonprojz  nonprojective train sentence(s)z nonprojective dev sentence(s)z% label(s) in projectivized train dataz: zLow number of examples for z label(s) in the projectivized dependency trees used for training. You may want to projectivize labels such as punct before training in order to improve parser performance.z3Projectivized labels with low numbers of examples: z7The following labels were found only in the train data:z5The following labels were found only in the dev data:z5To train a parser, your data should include at least z instances of each label.rootsr%   zMultiple root labels (zq) found in training data. spaCy's parser uses a single root label ROOT so this distinction will not be available.z. nonprojective projectivized train sentence(s)n_cyclesz, projectivized train sentence(s) with cyclestrainable_lemmatizerzTrainable Lemmatizerlemmatizer_treesz. lemmatizer trees generated from training dataz) lemmatizer trees generated from dev dataz lemmatizer trees (z7% of dev trees) were found exclusively in the dev data.z/All trees in dev data present in training data.n_low_cardinality_lemmasz) training docs with 0 or 1 unique lemmas.z$ dev docs with 0 or 1 unique lemmas.no_lemma_annotationsz) training docs with no lemma annotations.z)All training docs have lemma annotations.z$ dev docs with no lemma annotations.z$All dev docs have lemma annotations.partial_lemma_annotationsz. training docs with partial lemma annotations.z2All training docs have complete lemma annotations.z) dev docs with partial lemma annotations.z-All dev docs have complete lemma annotations.Summary checkchecksz passedwarningwarningserrorerrors)Mr   r-   r   load_configload_model_from_configconfiginterpolater"   resolver   r!   
pipe_namesget_pipe_metafactoryrk   r#   
initializegoodlist_compile_goldtextlangjoinlenfailintersectionrG   BLANK_MODEL_THRESHOLDBLANK_MODEL_MIN_THRESHOLDinfomost_common_format_labelsvocabvectorsmodeVectorsModefloretvectors_lengthminnmaxnn_keyssumvaluesformat_get_labels_from_spancattableitemskeysNEW_LABEL_THRESHOLDloading_get_examples_without_label_get_span_characteristics_print_span_characteristics_get_spans_length_freq_dist_filter_spans_length_freq_dist SPAN_LENGTH_THRESHOLD_PERCENTAGEmax_format_freqsSPAN_DISTINCT_THRESHOLDr   BOUNDARY_DISTINCT_THRESHOLDset_get_labels_from_modelzipnumpyarraylog2DEP_LABEL_THRESHOLDr   appendrd   r   GOODWARNFAILsysexit)Pr<   rC   r>   r?   r@   rD   r   cfgr   Tsourced_componentsrS   presume_componentspipelinepipefactory_names	dot_namesrU   train_datasetdev_datasetgold_train_datagold_train_unpreprocessed_datagold_dev_datatrain_texts	dev_textsoverlapr   r`   n_misalignedmost_common_wordsn_missing_vectorsmodel_labels_spancathas_low_data_warninghas_no_neg_warning	spans_keydata_labelsdata_labels_in_componentr|   countspans_key_in_modelneg_docsspan_characteristics_span_freqs_filtered_span_freqsrs   all_span_tokensw_most_common_spansru   all_span_bound_tokensmost_common_boundslabelslabel_countsmodel_labelshas_ws_ents_errorhas_boundary_cross_ents_warningmissing_valueslabels_with_countsmissing_labels
label_listrd   norm_entropysents_per_doclabels_trainlabels_train_unpreprocessed
labels_devr   rare_projectivized_labelstrees_train	trees_devdev_not_trainpctngood_countswarn_countsfail_countsrY   rT   sP                                                                                 @@rK   rI   rI   {   sE    IC 
{	+ N{6FG))#.'')VJ/8LM	N 04-.$6Uq!CT:TUU~~HADPS&&t,44PMPCKK&' >"AlO4I0CL*NN,-CHH45c*+Mz#'KCHH!" $M=#QUVO%2}cU&" "+}cTRM!'*Kg&I-.CKK !CHHz#(($%CHH"499X#6"7894TYY?P5Q4RST&tyy1B'C&DEFCHHM"#>23CHHK !!123}%&+**956GG9FGHBC ]!36K!KA#mBTAUUVW} 99CHHTNCHHTN/0E/F G12!5	
 CKK!"i(GCHH)/OG4L0M/NhW +,q0&';<L>!HIJ)*Q.$%9:L>!CDE'0<<R@CHH
 0A$!O PQ 39999!![%7%77CHH&s399+<+<'=&>j99++,M99$$))*!CII,=,=,B,B+C D"# CHHsyy(()**SYY5F5F5M5M4N O  #		 8 89G !$O4K$L$S$S$U VCHHELL%,y/IIJ CHH:AA"'(?@LLRP#  	9:M!%:m%K7<$")*		&X/FPTU/g>&5i&@&F&F&H 	"I{CHH	{"^K4E4E4GPT%U$VW	 2668$
 y1)<<$
  $
 '?&D&D&F W	"I{ + 1 1 3 .u%.2F2K2K2M%M"&!5i!@@CHH!%(UV_U` aB B
 //CHH<UG:i[X[\a[bbcd ,0( S[[!BC :%ui H q=CHHHqQR)-&/.2 @A '@!?I($
 CHH;I;aHICHHOP'(<=5/	:K $B'G$  CHH899Y+00234 5,\:;6BVWcBdAe f55BCW5X4Y Z   CHH5mK6P5QR $H-0GGPQIJ*95<<>G'*7GI'>O/>/J/J2/N Otq! O OCHH077"#45 	 $H-0KKWXST+J7>>@H-079-E!0E0Q0QRT0U!V1!!V!VCHH9@@"#56 	eW	r  CHH,--HJ CHH=>CHH 	 CHHLM 
.u5
 
 'u--c59$"!*/'./C%&i01%c*N##LMN 	<E5zQ:;	<
 !- 8 8 :
u| EN
 

 ,,>tL)*<)=>WM%.CHH+N;<A?
 9%CHH	233STU $ 	.EE"&998s<PUCVBWWXY (,$ S[[!BC X:=%QVWHXq=CHHHqQR)-&	. 01CHH"#89::fg /3+#CHH=>!CHHLM CHHST.CHH?@CHH,--HJ
 CHH 	 CHHE
 M!=>'Y7(VY?@8N62347C#of&=">>CHH+N;<A?
 v&'3}V/D+EECHH!!/0G!H I J-mF.CDEQH v;?CHH' /01401A5CHH* ./!3 CHH
 ,-1CHH },67'-AB(VY?@8N62347C#of&=">>CHH+N;<A?
 v&'3}V/D+EECHH!!/0G!H I J-mF.CDEQH /01401A5CHH* ./!301Q6 CHHG
 01A5O = ,- /&"9"?"?"AB
FC
O$$;<=KKKUZZ]*//1EJJs:4OOL>!ABC-c8<Z%.CHH+N;<A?
 ,F#//1$
 	#'2-'/0)8)BCeC
C-c?CC
O$$;<=Z%.CHH+N;<A?
 ,H%113D
 	#'2= $() 	_Y/0 1(3oi6PPQTUU\^	
 (	2S9Q5RR3CHH-mC-@ A/ 0 ,;6+BC%CC=fE'
E'
# '
 *7v)>?e?
?)+6:6{CICHHvi[(HIJ%)%k2ICHHvi[(FGHC3455LMNC%&&KLM+*62>>@
 	#'2 4F; 	,E-f5e<@SS8 @6v>uEFaI (,$	, %'!$V, 	E'.2EE&)00gR 7 >?@	 ()A-CHH-c2K.L-M NJ J CHHE		34
 $(  |s:.CHHI		#l+c*o=> z?S..CHHG		#j/C,==>  CHH,--FH -g67!;CHHII<WEFG HHI ;'!+CHH56 72 3 :&*CHH455ab .*+ /0B C+,>?	 	C$%%STUC	N##LMN!K/}"m$s9~5CCHH}%&&9#)C I; ; CHHFG56: :;ACHHsCDE34q889ACHHs>?@12Q6 67ACHHsCDECHH@A/01445ACHHs>?@CHH;<67!; ;<ACHHsHIJCHHIJ4599:ACHHsCDECHHDECKK	**X]]+K**X]]+K**X]]+KK=[A-='8!LGTUK={a/?)Z!PQRK=[A-='8!LMN AN N VPn$
,  N !P  "W\
2X XB DJ D'
 @s   A%A]	A](A]"A]!6A]&"A]++A]8.A^A^%A^&A^B	A^$F,	A^)F>	A^.G	A^3]A]]+A]5]8A^	^A^!		file_pathr   returnc                    | j                   d   }| j                  dk(  rK |j                  d| d      5  t        j                  |       }d d d         |j
                  d|        S | j                  dk(  rK |j                  d| d      5  t        j                  |       }d d d         |j
                  d|        S  |j                  d| j                   dd	
       y # 1 sw Y   xY w# 1 sw Y   OxY w)Nz.jsonzLoading z...zLoaded z.jsonlzCan't load file extension zExpected .json or .jsonlr%   )exits)partssuffixr   srsly	read_jsonr   
read_jsonlr   )r  r   	file_namer2   s       rK   
_load_filer%    s    #I7"S[[8I;c23 	.??9-D	.79+&'			X	%S[[8I;c23 	/##I.D	/79+&'CHH
$Y%5%5$67"	. 	.
	/ 	/s   C'C3'C03C<examplesr   rY   r[   c                    i dt               dt               dt               dt               dt               dt               dt               dt               d	t               d
t               dt               dddddddddt               ddddddt               t               dddd	}d|v rt        |j                  j
                        }| D ]Q  }|j                  }|j                  }|D 	cg c]  }	|	j                   }
}	|d   j                  |
       |dxx   t        |
      z  cc<   |j                  }|D ]Q  }|j                  j                         r|j                  j                  |j                      dk7  sE|dxx   dz  cc<   S |d   j#                  |j                         t        |j                  j$                        rd|D cg c]  }|j                   c}D ]G  }|j                  j
                  |   |j                  j$                  vs3|d   j                  |g       I d|v r|j'                         }t)        |j+                               D ]  \  }}|	|j-                  d      r||   j.                  r|dxx   dz  cc<   |j-                  d      rt1        |      }|d   |xx   dz  cc<   ||   r|j-                  d      r|dxx   dz  cc<   |dk(  s|d   dxx   dz  cc<    d|v sd|v rt3        |j                  j4                  j7                               D ]o  }||d   vrt               |d   |<   t)        |j                  j4                  |         D ]/  \  }}|j8                  |d   |   |j8                  xx   dz  cc<   1 ||d	   vrt               |d	   |<   |j4                  |   D ]e  }|j8                  |j8                  |d	   |   vrg |d	   |   |j8                  <   |d	   |   |j8                     j;                  t        |             g ||d
   vrt               |d
   |<   |j4                  |   D ]O  }|j8                  |d
   |   vrg |d
   |   |j8                  <   |d
   |   |j8                     j;                  |       Q d}||d   vrt               |d   |<   |j4                  |   D ]  }|j8                  |d   |   vrg g d|d   |   |j8                  <   t=        |      D ]  }|j>                  |dz   z
  }|dk\  r-|d   |   |j8                     d   j;                  |||dz           |j@                  |dz   z   }|t        |      k  sh|d   |   |j8                     d   j;                  ||dz
  |          r d|v sd |v r|d   j                  |jB                         tE        d! |jB                  jG                         D              r|d"xx   dz  cc<   t3        |jB                  jG                               jI                  d      dk7  r|d#xx   dz  cc<   d$|v r8|jK                  d%d&'      }|d   j                  |D 	cg c]  }	|	|		 c}	       d(|v r|jK                  d)d&'      }|jK                  d*d&'      }tM        ||      D ]  \  }}|||d+k(  r|d+k(  rtO        jP                  |      }|r||tR        jT                  <   |j                  j                  j
                  |j                  j                  jV                  j#                  |         }|d   j                  |g        d,|v r|jY                  |-      \  }} |d   j                  | D 	cg c]  }	|	|		 c}	       t)        tM        | |            D ]0  \  }\  }!}"|"|k(  s|d   j                  |!g       |dxx   dz  cc<   2 t[        j\                  |      r|d.xx   dz  cc<   t[        j^                  |      r|d/xx   dz  cc<   d|v s^ta        d0 |D              r|d1xx   dz  cc<   tE        d2 |D              r|d3xx   dz  cc<   t               }#|D ]x  }|jb                  dk7  s|#j#                  |jb                         j#                  |j                  |jd                        }$|jg                  |$      }%|d4   j#                  |%       z t        |#      d5k  s5t        |      dkD  sE|d6xx   dz  cc<   T |S c c}	w c c}w c c}	w c c}	w )7Nrv   r   r   r   r   ra   r   rg   rn   spans_per_typesb_per_typer   r   r   r`   rb   rf   r   )	r   r   r   r   r\   r   r   r   r   r   r%   r\   )B-U-L-)r*  r+  )zI-r,  re   rh   )startendr-  r.  r   r   c              3   $   K   | ]  }|d v 
 yw))r   r%   NrX   )r{   vals     rK   r}   z _compile_gold.<locals>.<genexpr>  s     C3f$Cs   r   r   r   TAGT)	as_stringr   POSMORPH r   )projectivizer   r   c              3   :   K   | ]  }|j                   d k(    ywr   Nlemmar{   tokens     rK   r}   z _compile_gold.<locals>.<genexpr>       65;;!#6   r   c              3   :   K   | ]  }|j                   d k(    ywr8  r9  r;  s     rK   r}   z _compile_gold.<locals>.<genexpr>  r=  r>  r   r   r   r   )4r   dictr   r   r   strings	reference	predictedr   updater   	alignmentorth_isspacex2ylengthsiaddr   get_aligned_sent_starts	enumerateget_aligned_ner
startswithis_spacer    r   spansr   label_r   ranger-  r.  r   anyr   r   get_alignedr   r   feats_to_dictr   POS_FEAT
morphologyget_aligned_parser   is_nonproj_treecontains_cycleallr:  lemma_tree_to_str)&r&  r   rY   r[   r2   treeseggolddocxvalid_wordsalignr<  twordsent_startsrJ  r|   combined_labelr   spanwindow_sizeoffsetsb_start_idx
sb_end_idxr   pos_tagsr   posmorph
label_dictaligned_headsaligned_depsdephead	lemma_settree_idtree_strs&                                         rK   r   r      s	   wy	 		 	')	
 		 	 	 	46 	 	$& 	tv 	1 	q 	1 	a  	 !" 	1#$ E !%&$%5D8 .#))++, S6||ll'+,!qvv,,W[)Y3{++ 	0E{{""$yy  )Q.)*a/*		0
 	W#((#syy  !),-A- A99$$T*#))2C2CC0188$@A M!446K%b&8&8&:; *5=##$67CFOOOq(O##L1%7%>NK/14/q>e&6&6|&D./14/c\K$)$* %)>-)O!",,"4"4"9"9";< 3	DO318DOI.(););I)FG EGAt{{* Y	24;;?1D?	E D$886:fD(3 JJy1 SD{{* {{$~*>y*IIGI^,Y7D(3DKK@GGD	RS D)9$::8<D)*95 JJy1 PD{{$/?*@*KKIK-.y9$++F)*95dkkBII$OP  D$7759VD'	2 JJy1 D{{$}*=i*HH &(#%G]+I6t{{C #("4 '+zzVaZ'@'1, /	:4;;GPWW $\L14D E &*XX!%<
%T2 /	:4;;GNUU $Z!^j AA3j %)=)NL		*C		0@0@0BCC()Q.)DII$$&'--a0A5()Q.)}$>>%4>8DLD BqAM BCm+~~et~<H^^Gt^<F!(F3 3
U
 ;%- BY5B; ",!9!9%!@J=@
=#9#9:LL..66**5599*EE N))5'2'3( }$*,*>*>I*>*V'M<LL JqAM JK"+Cm,L"M );C19M((#/Oq(O) &&}5[!Q&!%%m4Z A% !]2666+,1,66601Q61I ;;;!#MM%++.#ii

ELLAG$009H+,00:; 9~!c$i!m/0A50gS6h Kc - .` !C4 !Ks$   7d.d35d8
=d8
d=
d=
r  rd   c                      y rW   rX   r  rd   s     rK   r   r     s    RUrM   c                      y rW   rX   r{  s     rK   r   r     s     rM   c                 &   |rMdj                  t        t        t        t        t
        f      |       D cg c]  \  }}d| d| d c}}      S dj                  t        t        t           |       D cg c]  }d| d
 c}      S c c}}w c c}w )Nr]   rm   rl   r^   )r   r   r   r   strint)r  rd   lcs       rK   r   r     s     yy(,XeCHo-F(OP1q3qc^P
 	
 99Xc]F(CD1!AhDEE QDs   B
4Bfreqssortc           
      T   |r"t        t        | j                                     } | j                         D cg c]  \  }}t        |      |f }}}dj	                  t        t        t        t        t        f      |      D cg c]  \  }}| d| d c}}      S c c}}w c c}}w )Nr]   z (z%))	r@  sortedr   r~  r   r   r   r   float)r  r  kv_freqsr  r  s          rK   r   r     s    VEKKM*+&+kkm4das1vqk4F499#'sEz1B(CV#LM41aA3b2M  5Ms   BB$
r|   	component)rv   rg   r   c                 >   d}| D ]  }|dk(  r*|j                         D cg c]  }|dvrt        |       }}|dk(  rK||j                  j                  v r1|j                  j                  |   D cg c]  }|j                   c}ng }vs|dz  } |S c c}w c c}w )Nr   rv   rx   rg   r%   )rN  r    rB  rQ  rR  )r2   r|   r  r   r   r`  r  rj  s           rK   r   r     s     E   //1 00 #5)F  	!  2 22 *,););I)FGG  QJE!" L Hs   B/Bfactory_namec                    | j                   D cg c]"  }| j                  |      j                  |k(  r|$ }}t               }|D ]@  }| j	                  |      }t        |t              sJ |j                  |j                         B |S c c}w rW   )	r   r   r   r   get_pipe
isinstancer   rD  r  )rY   r  	pipe_namer   r  r   s         rK   r   r     s     Y'//<? 	J 
 uF #	||I&$...dkk"# Ms   'B	c                 n   | j                   D cg c]!  }| j                  |      j                  dv r|# }}i }|D ]r  }| j                  |      }t	        |t
              sJ |j                  |vrt               ||j                  <   ||j                     j                  |j                         t |S c c}w )N)rg   rh   )
r   r   r   r  r  r   keyr   rD  r  )rY   r  r   r  r   s        rK   r   r   $  s     Y'//3UU 	J 
 #%F -	||I&$000886!"uF488txx,- Ms   &B2r  c                 x    t        j                  t        j                  d | D              t        |       z        S )z Compute geometric mean of a listc              3   F   K   | ]  }t        j                  |        y wrW   )mathlog)r{   rJ  s     rK   r}   z_gmean.<locals>.<genexpr>6  s     5adhhqk5s   !)r  expfsumr   )r  s    rK   _gmeanr  4  s)    88DII5155A>??rM   metricfrequenciesc                     t        fd| j                         D              }|t        j                               z  S )Nc              3   4   K   | ]  \  }}||   z    y wrW   rX   )r{   	span_typevaluer  s      rK   r}   z_wgt_average.<locals>.<genexpr>:  s      V3C9eI..Vs   )r   r   r   )r  r  totals    ` rK   _wgt_averager  9  s2    Vv||~VVE3{))+,,,rM   	normalizec           	      h   t               }| D ]P  }|D ]I  }|j                  j                         j                  dd      j                  dd      }||xx   dz  cc<   K R |rJt	        |j                         d      }t        |j                         D ci c]  \  }}|||z   c}}      }|S c c}}w )z2Get the frequency distribution given a set of Docsz``"z''r%           )r   r   lowerreplacer   r   r   )	docsr  word_countsrb  r<  rf  r  r  r  s	            rK   _get_distributionr  >  s    "9K   	 E

  "**45==dCHANaN	  
 K&&(#.8I8I8KL1q!e)|LM Ms   B.
r   qc                 |    d}| j                         D ]&  \  }}||t        j                  |||   z        z  z  }( |S )zHCompute the Kullback-Leibler divergence from two frequency distributionsr  )r   r  r  )r   r  r  rg  p_words        rK   _get_kl_divergencer  L  sF    E	 5f$((6AdG#34445LrM   	span_datac           	          |D ci c]  gt        fd| D              z    }}t        |j                               S c c}w )z*Compile into one list for easier reportingc              3   :   K   | ]  }t        |           y wrW   )r(   )r{   dr|   s     rK   r}   z#_format_span_row.<locals>.<genexpr>W  s     J1nQuX6Js   )r   r   )r  r  r|   r  s     ` rK   _format_span_rowr  T  sQ     	 	wJ	JJJJ	A 	 
		s    Acompiled_goldc                 |   |d   |   }|d   |   j                         D ci c]  \  }}|t        |       }}}|d   |   j                         D ci c]  \  }}|t        |       }}}|d   |   j                         D cg c]  }t	        |       }	}|d   |   j                         D cg c]  }t        |       }
}t        | D cg c]  }|j                   c}d      }|d   |   j                         D ci c]  \  }}|t        |d       }}}|d   |   j                         D ci c]  \  }}|t        |d   |d   z   d       }}}|j                         D ci c]  \  }}|t        ||       }}}|j                         D ci c]  \  }}|t        ||       }}}||||t	        |	      t        |
      t        ||      t        ||      t        ||      t        |j                               ||d	S c c}}w c c}}w c c}w c c}w c c}w c c}}w c c}}w c c}}w c c}}w )
zObtain all span characteristicsrg   rn   r(  T)r  r)  r-  r.  )sdbdr(  rI  rp   rq   rr   rt   
avg_lengthr  rs   ru   )r   r  r   r   minr   r  rB  r  r  r   r   )r&  r  r   r   r|   r  span_lengthrQ  r(  min_lengthsmax_lengthsr`  p_corpusrs   sbru   	freq_distspan_distinctivenesssb_distinctivenesss                      rK   r   r   ]  s     	*95K &n5i@FFHE1 	vayK  **:;IFLLNE5 	s5zN  $1#@#K#R#R#TUa3q6UKU#0#@#K#R#R#TUa3q6UKU !!B2",,!BdSH **:;IFLLNE5 	 $77G  '}5i@FFHE2 	 Gr%y!8DIIH  !(E9 	!)X66  !) 0E9 	!)X66  # (+&+&3[A1;?";<{'')* A VU "Cs5   HHH1HH!H&7!H,.H2H8r   c                 p   d}t        dt        d | d   D                    }| d   | d   | d   | d   g}t        || d   	      }| d
   | d   | d   g}dg|D cg c]  }dj                  t        |d             c}z   dgz   }t	        j
                  |||ddgdgt        |      dz   z  z   |       yc c}w )z+Print all span characteristics into a table)z	Span TypeLengthSDBDN   c              3   2   K   | ]  }t        |        y wrW   )r   rz   s     rK   r}   z._print_span_characteristics.<locals>.<genexpr>  s     Q#e*Qs   r  rI  r  r  r(  )r  r  r  rr   rt   zWgt. Averagez{:.2f}r   re   Tr  rr%   )footerrj   rk   alignsmax_colN)r   r  r   roundr   r   r   )r   headersr  
table_datar   footer_datafr  s           rK   r   r     s    6G"cQ2Fx2PQQRG 	Y'T"T"-.	J %9(%CE
 	\*X&X&K 
+NQHOOE!QK8NNRUQVV  IIuuK 01 455 Os   "B3length_dictc                 >   g }| j                         D ]  \  }}|j                  |        t               }|D ]&  }|j                  |      r||xx   dz  cc<   "d||<   ( i }|j	                         D ]'  \  }}	|	t        |      z  dz  }
t        |
d      }
|
||<   ) |S )zDGet frequency distribution of spans length under a certain thresholdr%   g      Y@r   )r   extendr   getr   r   r  )r  ro   all_span_lengthsr   rI  r  rJ  freq_dist_percentager  r   
percentages              rK   r   r     s     !'') )
7() !I ==aLALIaL	 '335 7Uc"233u<
:q)
,6[)7
  rM   r  ro   c                 b    d}i }| j                         D ]  \  }}||k\  r |S |||<   ||z  } |S )zFilter frequency distribution with respect to a threshold

    We're going to filter all the span lengths that fall
    around a percentage threshold when summed.
    r  )r   )r  ro   r  filtered_freq_distr  dists         rK   r   r     s[     E&__. TI  /3{+ rM   )F)T)rv   sc)_r  r   collectionsr   pathlibr   typingr   r   r   r   r	   r
   r   r   r   r   r   r   r!  typerwasabir   r   r   r5  r   compatr   languager   rX  r   r   r   r   r   (pipeline._edit_tree_internals.edit_treesr   pipeline._parser_internalsr   "pipeline._parser_internals.nonprojr   schemasr   rQ   r   r    training.initializer!   r"   r#   r   r$   r   _utilr&   r'   r(   r)   r*   r+   r,   r-   r   r   r   r   r   r   r   rE   ContextboolrL   r~  rI   r%  r   r   r  r  r   r   r   r   r  r  r  r  r  r   r   r   r   rX   rM   rK   <module>r     s    
         ) )    # D D @ 0 : * 2 8 . )	 	 	       #%   
$RVW *.$O ' !$]!  :	 {D'V }e*J-1	1 1 ~1 1& '1, -11n (*!O
O
 38nO
 	O

 O
 O
 O
d$ W  &xwx9x 
x 	x
 
#s(^xv 
 U8C= U'%. US U 
 U 
U38_%DM 	 
 F(3-%S/!::;FF 	Fc5j)    ,1#	
7
 '( }	
 	6  C ( tCSM/B  @d @u @
-c5j) - -E -
t w ' g % T
 DI $s) 37m3,0cN3GJ3	#s(^3l d38n  H "B  	#u* 2CJ,/	#u*rM   