
    3jv                     d   U d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
 ddlmZmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z.  ededd      Z/dZ0dZ1 e2h d      Z3e2e4   e5d<    e2h d      Z6e2e7   e5d<    e2h d      Z8e2e7   e5d<    e2h d      Z9e2e7   e5d<    e2h d       Z:e2e7   e5d!<   e6e8e9e:d"Z;e<e4e2e7   f   e5d#<    e2h d$      Z=e2e7   e5d%<   d&d'd(Z>e<e4e4f   e5d)<   d*e?d+ed,e2e4   d-efd.Z@d/e4d,e2e4   d0e4d-eAe   fd1ZBd/e4d*e?d-eCfd2ZDd3ZEd4ZFd5ZGd6ZHd7ZId*e?d8eJe-d9f   d:ed-eJe-d9f   fd;ZKd*e?d<eAeJe4eLf      d8eJe-d9f   d:ed-eAe   f
d=ZMd*e?d>eAe   d-eAe   fd?ZNd*e?d>eAe   d-eAe   fd@ZOdAZPd*e?d/e4d-e?dz  fdBZQd*e?d>eAe   d-eAe   fdCZRd*e?d>eAe   d-eAe   fdDZSefdddEdFdGd*e?dHedIe7dJe2e4   dz  dKe2e4   dz  dLe4dMe4d-eAe   fdNZTefdddEdFdGd*e?dHedIe7dJe2e4   dz  dKe2e4   dz  dLe4dMe4d-eAe   fdOZUy)Pu   Pipeline orchestrator — runs all detection stages in sequence.

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
    N)DEFAULT_MAX_BYTES)EncodingEra)BigramProfilehas_model_variantsinfer_languagescore_best_language)_NONE_RESULTDETERMINISTIC_CONFIDENCE
HIGH_BYTESDetectionResultPipelineContext)detect_ascii)	is_binary)
detect_bom)resolve_confusion_groups)detect_escape_encoding)detect_magic)detect_markup_charset)score_candidates)compute_lead_byte_diversitycompute_multibyte_byte_coveragecompute_structural_score)detect_utf8)detect_utf1632_patterns)filter_by_validity)REGISTRYEncodingInfoget_candidatesapplication/octet-stream)encoding
confidencelanguage	mime_typeg333333?i @  >   	iso8859-1
iso8859-15cp1252_COMMON_LATIN_ENCODINGS>.                                                                                                                                             _ISO_8859_10_DISTINGUISHING>   r(   r)   r+   r,   r-   r.   r0   r1   r2   r4      r5   r6   r7   r8   r9      r;   r<   r=   r>   r?   r@   rA   rB      rJ         rS      _ISO_8859_14_DISTINGUISHING>   rY      rZ   r[      r\   _WINDOWS_1254_DISTINGUISHING>   rC                     rD   rE      rF      rG            rH            rK   r^   rZ   _HP_ROMAN8_DISTINGUISHING)z
iso8859-10z
iso8859-14cp1254z	hp-roman8_DEMOTION_CANDIDATES>                           r(   r)   r,   r9   _KOI8_T_DISTINGUISHINGcp932cp949)shift_jis_2004euc_kr_MARKUP_SUPERSET_PROMOTIONSdatamarkup_resultallowedreturnc                    |j                   |S t        j                  |j                         }|||vr|S t        |   }	 | j	                  |d       t               }t        | t        |j                      |      }t        | ||      }||kD  r,t        ||j                  |j                  |j                        S |S # t
        t        f$ r |cY S w xY w)aE  Promote a markup-declared encoding to its superset when structural evidence supports it.

    If the declared encoding has a known superset, the superset validates the
    data, and the superset's structural score is materially better, return a
    new result using the superset encoding.  Otherwise return the original.
    stricterrors)r    r   getr   decodeUnicodeDecodeErrorLookupErrorr   r   r   r!   r"   r#   )r   r   r   superset_namesuperset_infoctx
base_scoresuperset_scores           J/DATA/.local/lib/python3.12/site-packages/chardet/pipeline/orchestrator.py_try_promote_markup_supersetr      s     %/33M4J4JKMW <]+MM(3 
C)$9O9O0PRUVJ-dM3GN
"$$""##	
 	
  , s    B; ;CCr    
param_namec                 z    | |vr)t        j                  | d| dt        d       t        gS t	        | dd      gS )zReturn a low-confidence result for *encoding*, or ``encoding=None`` if filtered out.

    ``stacklevel=5`` targets the public caller:
    detect() -> run_pipeline() -> _run_pipeline_core() -> _make_fallback_or_none().
     zL is excluded by include_encodings/exclude_encodings; returning encoding=None   )
stacklevelg?N)r    r!   r"   )warningswarnUserWarningr	   r   )r    r   r   s      r   _make_fallback_or_noner     sQ     wl!H< (K L		
 ~X$NOO    c                 ^    t         j                  |       yt        fd|D               S )au  Return True if encoding is a demotion candidate with no distinguishing bytes.

    Checks whether any non-ASCII byte in *data* falls in the set of byte
    values that decode differently under the given encoding vs iso-8859-1.
    If none do, the data is equally valid under both encodings and there is
    no byte-level evidence for preferring the candidate encoding.
    Fc              3   2   K   | ]  }|d kD  s	|v   yw   N ).0bdistinguishings     r   	<genexpr>z!_should_demote.<locals>.<genexpr>7  s     A1D1&s   
	)rq   r   any)r    r   r   s     @r   _should_demoter   ,  s2     *--h7NAAAAAr   g?   gffffff?      valid_candidates.r   c                 4   g }|D ]  }|j                   rt        | ||      }||j                  |j                  <   |t        k  r@|j
                  0t        |       t        | j                  dt                    z
  |_        |j
                  t        k  rt        | |||j
                        }||j                  |j                  <   |t        k  r|j
                  t        k\  rt        | ||      }|t        k  r|j!                  |       	 t#        |      S )a  Eliminate CJK multi-byte candidates that lack genuine multi-byte structure.

    Four checks are applied in order to each multi-byte candidate:

    1. **Structural pair ratio** (valid_pairs / lead_bytes) must be
       >= ``_CJK_MIN_MB_RATIO``.  Catches files with many orphan lead bytes.

    2. **Minimum non-ASCII byte count**: the data must contain at least
       ``_CJK_MIN_NON_ASCII`` bytes > 0x7F.  Tiny files with 1-5 high bytes
       can accidentally form perfect pairs and score 1.0 structurally.

    3. **Byte coverage** (non-ASCII bytes in valid multi-byte sequences /
       total non-ASCII bytes) must be >= ``_CJK_MIN_BYTE_COVERAGE``.  Latin
       text has many high bytes that are NOT consumed by multi-byte pairs;
       genuine CJK text has nearly all high bytes accounted for.

    4. **Lead byte diversity**: the number of distinct lead byte values in
       valid pairs must be >= ``_CJK_MIN_LEAD_DIVERSITY``.  Genuine CJK text
       draws from a wide repertoire of lead bytes; European false positives
       cluster in a narrow band (e.g. 0xC0-0xDF for accented Latin).

    Returns the filtered candidate list.  Structural scores are cached in
    ``ctx.mb_scores`` for reuse in Stage 2b.
    N)non_ascii_count)is_multibyter   	mb_scoresname_CJK_MIN_MB_RATIOr   len	translater   _CJK_MIN_NON_ASCIIr   mb_coverage_CJK_MIN_BYTE_COVERAGE_CJK_DIVERSITY_MIN_NON_ASCIIr   _CJK_MIN_LEAD_DIVERSITYappendtuple)r   r   r   gatedencmb_scorebyte_coveragelead_diversitys           r   _gate_cjk_candidatesr   X  s   : !#E/c3?H&.CMM#((#++""*&)$i#dnnT:6V2W&W#""%77;c30C0CM )6COOCHH%55""&BB!<T3!L!$;;S)  * <r   structural_scoresc           	      V   |D ci c]  }|j                   s|j                  | c}t        fd|D              }t        d |D              }t        t	        | dt
         g ||            }g }|D ]  }	|	j                  r&|j                  j                  |	j                  d      nd}
|
dk\  rL|j                  t        |	j                  |	j                  d|
z   z  |	j                  |	j                               |j                  |	        |j                  d d	       |S c c}w )
a  Score structurally-valid CJK candidates using statistical bigrams.

    When multiple CJK encodings score equally high structurally, statistical
    scoring differentiates them (e.g. euc-jp vs big5 for Japanese data).
    Single-byte candidates are also scored and included so that the caller
    can compare CJK vs single-byte confidence.

    Multi-byte candidates with high byte coverage (>= 0.95) receive a
    confidence boost proportional to coverage.  When nearly all non-ASCII
    bytes form valid multi-byte pairs, the structural evidence is strong
    and should increase the candidate's ranking relative to single-byte
    alternatives whose bigram models may score higher on small samples.

    Note: boosted confidence values may exceed 1.0 and are used only for
    relative ranking among candidates.  ``run_pipeline`` clamps all
    confidence values to [0.0, 1.0] before returning to callers.
    c              3   8   K   | ]  \  }}|v s|     y wNr   )r   r   _sc
enc_lookups      r   r   z/_score_structural_candidates.<locals>.<genexpr>  s&      *;YT3tz?Q
4*;s   
c              3   :   K   | ]  }|j                   r|  y wr   )r   )r   es     r   r   z/_score_structural_candidates.<locals>.<genexpr>  s     J#3a1>>#3s   N        gffffff?   c                     | j                   S r   )r!   xs    r   <lambda>z._score_structural_candidates.<locals>.<lambda>  s    q||r   Tkeyreverse)r   r   r   listr   _STAT_SCORE_MAX_BYTESr    r   r   r   r   r!   r"   r#   sort)r   r   r   r   r   valid_mbsingle_byteresultsboostedrcoverager   s              @r   _score_structural_candidatesr     s   0 ,++aq~~	++J  *; H J#3JJK4457P7PK7PQG
 &(G;<::3??&&qzz373tNNJJH =qzz1;; NN1  LL+TL:N1+s
   D&D&r   c                    t        |      dkD  r|d   j                  t        |d   j                  |       r|d   j                  }|d   j                  }|dd D ]  }|j                  t        v st        |j                  ||j                  |j                        }|D cg c]  }|j                  |k7  s||us| }}|D cg c]  }|j                  |k(  s| }}|g||c S  |S c c}w c c}w )a  Demote niche Latin encodings when no distinguishing bytes are present.

    Some bigram models (e.g. iso-8859-10, iso-8859-14, windows-1254) can win
    on data that contains only bytes shared with common Western Latin
    encodings.  When there is no byte-level evidence for the winning
    encoding, promote the first common Western Latin candidate to the top and
    push the demoted encoding to last.
    r   r   N)r   r    r   r!   r'   r   r"   r#   )	r   r   demoted_encodingtop_confr   promotedr   othersdemoted_entriess	            r   _demote_niche_latinr     s    	GqAJ+71:..5"1:..1:((Azz44*JJ!**akk  '&!!**8H*HQVWZAw   /6"XgGW9W1g"X <6<O<<  N #Ys    C+5C+:C+C0C0c                 r   |r|d   j                   dk7  r|S t        d t        |      D        d      }||S t        d | D              rh||   }|d   j                  }t        |j                   ||j                  |j                        }t        |      D cg c]  \  }}||k7  s| }}}|g|S |S c c}}w )a  Promote KOI8-T over KOI8-R when Tajik-specific bytes are present.

    KOI8-T and KOI8-R share the entire 0xC0-0xFF Cyrillic letter block,
    making statistical discrimination difficult.  However, KOI8-T maps 12
    bytes in 0x80-0xBF to Tajik-specific Cyrillic letters where KOI8-R has
    box-drawing characters.  If any of these bytes appear, KOI8-T is the
    better match.
    r   zkoi8-rc              3   F   K   | ]  \  }}|j                   d k(  s|  yw)zkoi8-tN)r    )r   ir   s      r   r   z!_promote_koi8t.<locals>.<genexpr>  s"     Q$6DAq!**:Pa$6s   !!Nc              3   8   K   | ]  }|d kD  s	|t         v   ywr   )rz   )r   r   s     r   r   z!_promote_koi8t.<locals>.<genexpr>  s     
A1D1&&s   
)r    next	enumerater   r!   r   r"   r#   )	r   r   	koi8t_idxkoi8t_resultr   r   r   r   r   s	            r   _promote_koi8tr     s     gaj))X5QIg$6QSWXI

A
AAy)1:(("!!!!""	
 !*' 2E 21a9n! 2E"6""N Fs   B3%B3i   c                     |dk(  r| S 	 | j                  |d      j                  dd      S # t        t        t        f$ r Y yw xY w)aP  Decode data from encoding and re-encode as UTF-8 for language scoring.

    Returns None if the encoding is unknown. For UTF-8, returns data as-is.
    Uses ``errors="ignore"`` because the data already passed byte-validity
    filtering for the detected encoding; any residual invalid bytes are
    irrelevant for language scoring.
    utf-8ignorer   surrogatepassN)r   encoder   	TypeError
ValueError)r   r    s     r   _to_utf8r     s[     7{{8H{5<<O = 
 	
 J/ s   #- AAc           	         g }d}d}|D ]G  }|j                   }||j                  t        |j                        }|?| r=t        |j                        r(|t	        |       }t        | |j                  |      \  }}|R| rPt        d      rEt        | |j                        }|r-||j                  dk7  rt	        |      }t        |d|      \  }}|j                  }	|	|j                  dnd}	||j                   k7  s|	|j                  k7  r3|j                  t        |j                  |j                  ||	             7|j                  |       J |S )a  Fill in language and mime_type for results missing them.

    **Language** (only for text results where ``encoding is not None``):

    Tier 1: single-language encodings via hardcoded map (instant).
    Tier 2: multi-language encodings via statistical bigram scoring (lazy).
    Tier 3: decode to UTF-8, score against UTF-8 language models (universal fallback).

    **MIME type**: text results default to ``"text/plain"``, binary results
    (``encoding is None``) default to ``"application/octet-stream"``.
    N)profiler   z
text/plainr   )r"   r    r   r   r   r   r   r#   r   r   r!   )
r   r   filledr   utf8_profileresultlang_	utf8_datamimes
             r   _fill_metadatar     sM    %'F$(G)-L<FOO7!&//2D|);FOO)L?+D1G-dFOOWU4|);G)D$T6??;	#+v'/I'4Y'?1!7LGAt < ??. /  6??"df.>.>&>MM1B1BD$O MM&!C D Mr   c                 J    t        | |      }t        | |      }t        | |      S )zGApply confusion resolution, niche Latin demotion, and KOI8-T promotion.)r   r   r   )r   r   s     r   _postprocess_resultsr   O  s)    
 'tW5G!$0G$((r   r&   r   include_encodingsexclude_encodingsno_match_encodingempty_input_encodingencoding_era	max_bytesr  r  r  r  c                   t               }| d| } t        |||      }t        d |D              }	| st        ||	d      S t	        |       }
|
|
j
                  |	v r|
gS t        |       }||j
                  |	v r|gS t        |       }||j
                  |j
                  |	v r|gS t        |       }||gS t        |       }t        |       }||t        | |      rt        gS t        |       }||j
                  |	v rt        | ||	      }|gS ||j
                  |	v r|gS ||j
                  |	v r|gS t        | |      }|st        ||	d      S t!        | ||      }|st        ||	d      S g }|D ]f  }|j"                  s|j$                  j'                  |j(                        }|t+        | ||      }|dkD  sJ|j-                  |j(                  |f       h |rA|j/                  d d	       |d
   \  }}|t0        k\  rt3        | |||      }|rt5        | |      S | dt6         }t9        t;        |t=        |                  }|st        ||	d      S t5        | |      S )zBCore pipeline logic. Returns list of results sorted by confidence.Nc              3   4   K   | ]  }|j                     y wr   )r   )r   r   s     r   r   z%_run_pipeline_core.<locals>.<genexpr>k  s     'GJSJs   r  )r  r  r   c                     | d   S )Nr   r   r   s    r   r   z$_run_pipeline_core.<locals>.<lambda>  s    QqTr   Tr   r   )r   r   	frozensetr   r   r    r   r   r   r   r   r   _BINARY_RESULTr   r   r   r   r   r   r   r   r   r   r    _STRUCTURAL_CONFIDENCE_THRESHOLDr   r   r   r   r   r   )r   r  r  r  r  r  r  r   
candidatesr   
bom_resultutf1632_resultescape_resultmagic_resultutf8_precheckascii_precheckr   r   r   r   scorer   
best_scorer   	stat_datas                            r   _run_pipeline_corer  Y  s    
C
D
  .?ARSJ''GJ'GGG% '+A
 	
 D!J*"5"5"@|
 -T2N!n&=&=&H
 +40M!"".""g-  %L~  %M "$'N
 	"di0
 *$/M ]%;%;w%F4T='R !n&=&=&H  ]%;%;w%F *$
;%&7BUVV ,D2BCH%&7BUVV 24MM%%chh/E}0sC@s{!((#((E):;   >4@)!,:992')93G +D'::
 ++,I#Iu5E/FGHG%&7BUVVg..r   c          
      2   t        | ||||||      }t        | dt         |      }|sd}t        |      |D 	cg c]S  }	|	j                  dkD  r@t        |	j                  t        |	j                  d      |	j                  |	j                        n|	U c}	S c c}	w )aU  Run the full detection pipeline.

    :param data: The raw byte data to analyze.
    :param encoding_era: Filter candidates to a specific era of encodings.
    :param max_bytes: Maximum number of bytes to process.
    :param include_encodings: If not ``None``, only return these encodings.
    :param exclude_encodings: If not ``None``, never return these encodings.
    :param no_match_encoding: Encoding returned when no candidate survives.
    :param empty_input_encoding: Encoding returned for empty input.
    :returns: A list of :class:`DetectionResult` sorted by confidence descending.
    r   Nz/pipeline must always return at least one resultg      ?)
r  r   _LANG_SCORE_MAX_BYTESRuntimeErrorr!   r   r    minr"   r#   )
r   r  r  r  r  r  r  r   msgr   s
             r   run_pipeliner    s    * !+++1G T"8#897CG?3 	 A <<# 	

Cc$:AJJT	 	  s   AB)V__doc__r   chardet._utilsr   chardet.enumsr   chardet.modelsr   r   r   r   chardet.pipeliner	   r
   r   r   r   chardet.pipeline.asciir   chardet.pipeline.binaryr   chardet.pipeline.bomr   chardet.pipeline.confusionr   chardet.pipeline.escaper   chardet.pipeline.magicr   chardet.pipeline.markupr   chardet.pipeline.statisticalr   chardet.pipeline.structuralr   r   r   chardet.pipeline.utf8r   chardet.pipeline.utf1632r   chardet.pipeline.validityr   chardet.registryr   r   r   r  r  r   r
  r'   str__annotations__rV   intr]   r`   ro   rq   dictrz   r   bytesr   r   r   boolr   r   r   r   r   r   r   r   floatr   r   r   r  r   r   r   r  r  r   r   r   <module>r7     s    , %   0 - + ? : / 9 9 
 . < 8 C C '(	 $(     +4+ 3  /8/1/ Ys^ 1n /8 "/ Ys^ "T 09(0 in  -6- 9S> : .-**	3 d3	#./  *3L* 	#  / T#s(^ !
!"! s^! 	!HPPs^P P 
/	P*BS B B$ B$          " 3
3L#-.3 
3 <	3l/
/E#u*-./ L#-./ 
	/
 
//d
/" 
/@
/" 
/H  5 C EDL $3
3/3	/3l)
)/") 
/) 'J/
 04/3% 'J/
J/J/ J/
 !~,J/ !~,J/ J/ J/ 
/J/` ',
 04/3% ',
,, ,
 !~,, !~,, , , 
/,r   