
    3j5,                         U d Z ddlZddlmZmZmZ dZdZdZdZ	dZ
d	Zd
Zdez   Zeed<   dededefdZdededz  fdZdededz  fdZdededz  fdZdedefdZddededefdZy)a  Stage 1a+: UTF-16/UTF-32 detection for data without BOM.

This stage runs after BOM detection but before binary detection.
UTF-16 and UTF-32 encoded text contains characteristic null-byte patterns
that would otherwise cause binary detection to reject the data.

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
    N)ASCII_TEXT_BYTESDETERMINISTIC_CONFIDENCEDetectionResulti      
   gQ?      ?gffffff?g333333?    _NULL_SEPARATOR_ALLOWEDdata	null_fracreturnc                 D    |t         k\  ry| j                  dt               S )u  Return True if the data looks like ASCII with null byte separators.

    :param data: The raw byte sample to examine.
    :param null_frac: The positional null fraction for this UTF-16 candidate
        (i.e. fraction of null bytes in even positions for BE, or odd positions
        for LE) — not the total null fraction across all bytes.

    Checks two conditions:
    1. The positional null fraction is below ``_NULL_SEPARATOR_MAX_FRACTION``
    2. Every non-null byte is printable ASCII or common whitespace

    When both conditions are met, the nulls are likely field separators
    (e.g. ``find -print0``), not UTF-16 encoding artifacts.
    FN)_NULL_SEPARATOR_MAX_FRACTION	translater
   )r   r   s     E/DATA/.local/lib/python3.12/site-packages/chardet/pipeline/utf1632.py_is_null_separator_patternr   6   s%     00~~d$;<<<    c                 n    | dt          }t        |      t        k  ryt        |      }||S t	        |      S )a  Detect UTF-32 or UTF-16 encoding from null-byte patterns.

    UTF-32 is checked before UTF-16 since UTF-32 patterns are more specific.

    :param data: The raw byte data to examine.
    :returns: A :class:`DetectionResult` if a strong pattern is found, or ``None``.
    N)_SAMPLE_SIZElen_MIN_BYTES_UTF16_check_utf32_check_utf16)r   sampleresults      r   detect_utf1632_patternsr   J   sD     -< F
6{%% &!F r   c           	          t               t               dz  z
  }|t        k  ry d|  |dz  }t         fdt        dt               d      D              }t         fdt        dt               d      D              }||k(  r8||z  dkD  r0	  j	                  d      }t        |      rt        dt        d      S 	 t         fd	t        d
t               d      D              }t         fdt        dt               d      D              }||k(  r9||z  dkD  r1	  j	                  d      }t        |      rt        dt        d      S 	 yy# t        $ r Y w xY w# t        $ r Y yw xY w)a  Check for UTF-32 encoding based on 4-byte unit structure.

    For valid Unicode (U+0000 to U+10FFFF = 0x0010FFFF):
    - UTF-32-BE: the first byte of each 4-byte unit is always 0x00
    - UTF-32-LE: the last byte of each 4-byte unit is always 0x00

    For BMP characters (U+0000 to U+FFFF), additionally:
    - UTF-32-BE: the second byte is also 0x00
    - UTF-32-LE: the third byte is also 0x00
       Nc              3   4   K   | ]  }|   d k(  sd  ywr      N .0ir   s     r   	<genexpr>z_check_utf32.<locals>.<genexpr>t        J#9aT!W\#9   r   c              3   :   K   | ]  }|d z      dk(  sd   yw)r!   r   Nr"   r#   s     r   r&   z_check_utf32.<locals>.<genexpr>v   s#     O$:qd1q5kQ>N$:s   r   z	utf-32-beencoding
confidencelanguagec              3   4   K   | ]  }|   d k(  sd  ywr    r"   r#   s     r   r&   z_check_utf32.<locals>.<genexpr>   s     I"8QDGqLq"8r(      c              3   4   K   | ]  }|   d k(  sd  ywr    r"   r#   s     r   r&   z_check_utf32.<locals>.<genexpr>   r'   r(      z	utf-32-le)	r   _MIN_BYTES_UTF32sumrangedecode_looks_like_textr   r   UnicodeDecodeError)r   trimmed_len	num_unitsbe_first_nullbe_second_nulltextle_last_nullle_third_nulls   `       r   r   r   `   sx    d)s4y1}-K%%Dq I J5CIq#9JJMOE!SY$:OON	!ny&@3&F		;;{+D%&(7!  & I%3t9a"8IILJ5CIq#9JJMy ]Y%>%D		;;{+D%&(7!  & ) " 		" " 		s$   -E -E 	EE	E'&E'c                 &    t        t               t              }||dz  z  }|t        k  ry|dz  }t	         fdt        d|d      D              }t	         fdt        d|d      D              }||z  }||z  }g }|t        k\  r"t         d| |      s|j                  d|f       |t        k\  r"t         d| |      s|j                  d|f       |syt        |      dk(  r<|d   d   }	  d| j                  |      }	t        |	      rt        |t        d	      S 	 yd}
d
}|D ]/  \  }}	  d| j                  |      }	t        |	      }||kD  s,|}|}
1 |
|t        k\  rt        |
t        d	      S y# t        $ r Y yw xY w# t        $ r Y jw xY w)a  Check for UTF-16 via null-byte patterns in alternating positions.

    UTF-16 encodes each BMP character as two bytes.  For characters whose
    code-point high byte is 0x00 (Latin, digits, basic punctuation, many
    control structures), one of the two bytes in each unit will be a null.
    Even for non-Latin scripts (Arabic, CJK, Cyrillic, etc.) a significant
    fraction of code units still contain at least one null byte.

    Non-UTF-16 single-byte encodings never contain null bytes, so even a
    small null-byte fraction in alternating positions is a strong signal.

    When both endiannesses show null-byte patterns (e.g., Latin text where
    every other byte is null), we disambiguate by decoding both ways and
    comparing text-quality scores.
    r1   Nc              3   4   K   | ]  }|   d k(  sd  ywr    r"   r#   s     r   r&   z_check_utf16.<locals>.<genexpr>        K#:ad1gl#:r(   r   c              3   4   K   | ]  }|   d k(  sd  ywr    r"   r#   s     r   r&   z_check_utf16.<locals>.<genexpr>   rA   r(   r!   z	utf-16-lez	utf-16-ber*         )minr   r   r   r3   r4   _UTF16_MIN_NULL_FRACTIONr   appendr5   r6   r   r   r7   _text_quality_MIN_TEXT_QUALITY)r   
sample_lenr9   be_null_countle_null_countbe_fracle_frac
candidatesr+   r<   best_encodingbest_quality_qualitys   `             r   r   r      s     SY-J*q. J$$aI K5J#:KKMK5J#:KKMi'Gi'G*,J**3M[j74 	;01**3M[j74 	;01 :!a=#		$++H5D%&%7!  &  !%ML!!	$++H5D  %\!"L$M "  \5F%F"/
 	
 5 " 		 " 		s$   ,0E5 ,F5	F F	FFr<   c                 `    | sy| dd }t        d |D              }|t        |      z  t        kD  S )z9Quick check: is decoded text mostly printable characters.FN  c              3   J   K   | ]  }|j                         s|d v sd  yw)
	r!   N)isprintable)r$   cs     r   r&   z#_looks_like_text.<locals>.<genexpr>   s     Jv!AMAvs   ##)r3   r   _MIN_PRINTABLE_FRACTION)r<   r   	printables      r   r6   r6      s8    $3ZFJvJJIs6{"%<<<r   limitc                 z   | d| }t        |      }|dk(  ryd}d}d}d}d}|D ]d  }	t        j                  |	      }
|
d   dk(  r|dz  }t        |	      dk  s4|dz  }:|
d   dk(  r|dz  }H|
dk(  s|	d	v r|dz  }W|
d   d
k(  s`|dz  }f ||z  dkD  ry||z  dkD  ry||z  }|||z  dz  z  }|dkD  r
|dkD  r|dz  }|S )u  Score how much *text* looks like real human-readable content.

    Returns a score in the range [-1.0, ~1.6).  Higher values indicate
    more natural text.  The practical maximum is 1.5 for all-ASCII-letter
    input (1.6 approaches as sample size grows with all ASCII letters plus
    whitespace).  A score of -1.0 means the content is almost certainly not
    valid text (too many control characters or combining marks).

    Scoring factors:

    * Base score: ratio of Unicode letters (category ``L*``) to sample length.
    * ASCII bonus: additional 0.5x weight for ASCII letters.  This is the
      primary signal for disambiguating endianness — correct decoding of
      Latin-heavy text produces ASCII letters, wrong decoding produces CJK.
    * Space bonus: +0.1 when the sample contains at least one whitespace
      character and is longer than 20 characters.
    * Rejection: returns -1.0 if >10% control characters or >20% combining
      marks (category ``M*``).
    Nr   rC   Lr!      MZsrV   Cg?g?r      )r   unicodedatacategoryord)r<   r[   r   nlettersmarksspacescontrolsascii_lettersrX   catscores               r   rG   rG      s   ( &5\FFAAvGEFHM""1%q6S=qLG1v|"Vs]QJED[AMaKFVs]MH  !|cqy3aKE	ma3&&E2v&1*Lr   )rT   )__doc__rc   chardet.pipeliner   r   r   r   r2   r   rE   rH   rY   r   r
   bytes__annotations__floatboolr   r   r   r   strr6   intrG   r"   r   r   <module>rv      s   	  X X             $  ")+;!;  ;=U =u = =( %  Od,B  ,5u 54!7 5pQu Q4!7 Qh=3 =4 =9 9C 9% 9r   