
    3j>                        U d Z ddlmZ ddlmZmZ ddlmZ dede	e
eef   fdZdede	e
eef   fdZdede	e
eef   fd	Zdede	e
eef   fd
Zdede	e
eef   fdZdede	e
eef   fdZdede	e
eef   fdZdede	e
eef   fdZdede	e
eef   fdZeeeeeeeedZeeeege	e
eef   f   f   ed<   dededede	e
eef   dz  fdZdededede
fdZ	 ddededededz  de
f
dZdedededefdZy)a  Stage 2b: Multi-byte structural probing.

Computes how well byte patterns in the data match the expected multi-byte
structure for a given encoding.  Used after byte-validity filtering (Stage 2a)
to further rank multi-byte encoding candidates.

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
    )Callable)
HIGH_BYTESPipelineContext)EncodingInfodatareturnc                    d}d}d}t               }d}t        |       }||k  r| |   }d|cxk  rdk  sn d|cxk  rdk  rdn na|dz  }|dz   |k  rN| |dz      }d|cxk  rdk  sn d	|cxk  rd
k  r.n n+|dz  }|j                  |       |dz  }|dkD  r|dz  }|dz  }|dz  }n|dz  }||k  r|dkD  r||z  nd}	|	|t        |      fS )zSingle-pass Shift_JIS structural analysis.

    Lead bytes: 0x81-0x9F, 0xE0-0xEF
    Trail bytes: 0x40-0x7E, 0x80-0xFC

    Returns (pair_ratio, mb_bytes, lead_diversity).
    r                  @   ~                       setlenadd
r   
lead_countvalid_countmbleadsilengthbtrailratios
             H/DATA/.local/lib/python3.12/site-packages/chardet/pipeline/structural.py_analyze_shift_jisr%      s    JK	
BeE	AYF
f*GA41#4#4!OJ1uv~QUE)T)tu/D/D1$KIIaL!GBt|aFAFAFA# f*$ )3QK*$CE"c%j      c                    d}d}d}t               }d}t        |       }||k  r| |   }d|cxk  rdk  sn d|cxk  rdk  rdn na|dz  }|dz   |k  rN| |dz      }d|cxk  rdk  sn d	|cxk  rdk  r.n n+|dz  }|j                  |       |dz  }|d
kD  r|dz  }|dz  }|dz  }n|dz  }||k  r|dkD  r||z  nd}	|	|t        |      fS )aB  Single-pass CP932 structural analysis.

    Lead bytes: 0x81-0x9F, 0xE0-0xFC
    Trail bytes: 0x40-0x7E, 0x80-0xFC

    Extends Shift_JIS by raising the lead byte ceiling from 0xEF to 0xFC,
    covering IBM vendor-defined characters (NEC-selected, IBM extensions).

    Returns (pair_ratio, mb_bytes, lead_diversity).
    r   r
   r   r   r   r   r   r   r   r   r   r   r   r   s
             r$   _analyze_cp932r(   E   s    JK	
BeE	AYF
f*GA41#4#4!OJ1uv~QUE)T)tu/D/D1$KIIaL!GBt|aFAFAFA# f*$ )3QK*$CE"c%j  r&   c                    d}d}d}t               }d}t        |       }||k  r| |   }|dk(  rH|dz  }|dz   |k  r5d| |dz      cxk  rdk  r$n n!|dz  }|j                  |       |dz  }|dz  }R|dz  }n|dk(  r\|dz  }|dz   |k  rId| |dz      cxk  rdk  r8n n5d| |dz      cxk  rdk  r$n n!|dz  }|j                  |       |d	z  }|d	z  }|dz  }n\d|cxk  rdk  rLn nI|dz  }|dz   |k  r6d| |dz      cxk  rdk  r%n n"|dz  }|j                  |       |dz  }|dz  }
|dz  }n|dz  }||k  r|dkD  r||z  nd
}||t        |      fS )zSingle-pass EUC-JP structural analysis.

    Two-byte: Lead 0xA1-0xFE, Trail 0xA1-0xFE
    SS2 (half-width katakana): 0x8E + 0xA1-0xDF
    SS3 (JIS X 0212): 0x8F + 0xA1-0xFE + 0xA1-0xFE

    Returns (pair_ratio, mb_bytes, lead_diversity).
    r      r         r            r   r   	r   r   r   r   r   r   r    r!   r#   s	            r$   _analyze_euc_jpr1   n   s    JK	
BeE	AYF
f*G9!OJ1uv~$$q1u+"="=q 		!aQFA$Y!OJADQK/4/DQK/4/q 		!aQFAQ$!OJ1uv~$$q1u+"="=q 		!aQFAFAI f*J )3QK*$CE"c%j  r&   c                 F   d}d}d}t               }d}t        |       }||k  rf| |   }d|cxk  rdk  rKn nH|dz  }|dz   |k  r5d| |dz      cxk  rdk  r$n n!|dz  }|j                  |       |dz  }|dz  }Z|dz  }n|dz  }||k  rf|dkD  r||z  nd}||t        |      fS )zSingle-pass EUC-KR structural analysis.

    Lead 0xA1-0xFE; Trail 0xA1-0xFE

    Returns (pair_ratio, mb_bytes, lead_diversity).
    r   r+   r.   r   r   r   r   r0   s	            r$   _analyze_euc_krr3      s     JK	
BeE	AYF
f*G1!OJ1uv~$$q1u+"="=q 		!aQFAFA f* )3QK*$CE"c%j  r&   c                    d}d}d}t               }d}t        |       }||k  r| |   }d|cxk  rdk  sn d|cxk  rdk  rqn nn|dz  }|dz   |k  r[| |dz      }d|cxk  rdk  sn d	|cxk  rd
k  sn d|cxk  rdk  r.n n+|dz  }|j                  |       |dz  }|dkD  r|dz  }|dz  }|dz  }n|dz  }||k  r|dkD  r||z  nd}	|	|t        |      fS )at  Single-pass CP949 (Unified Hangul Code) structural analysis.

    Lead bytes: 0x81-0xC8, 0xCA-0xFD
    Trail bytes: 0x41-0x5A, 0x61-0x7A, 0x81-0xFE

    Extends EUC-KR by lowering the lead byte floor from 0xA1 to 0x81 and
    adding ASCII letter trail ranges plus 0x81-0xA0.  0xC9 is not a valid
    UHC lead byte.

    Returns (pair_ratio, mb_bytes, lead_diversity).
    r   r
            r   A   Z   a   z   r.   r   r   r   r   r   s
             r$   _analyze_cp949r<      s    JK	
BeE	AYF
f*GA41#4#4!OJ1uv~QUU*d*----1$KIIaL!GBt|aFAFAFA+ f*, )3QK*$CE"c%j  r&   c                 ,   d}d}d}t               }d}t        |       }||k  r| |   }d|cxk  rdk  rn n|dz  }|dz   |k  r]d| |dz      cxk  rdk  rLn nId| |dz      cxk  rdk  r8n n5d| |dz      cxk  rdk  r$n n!|dz  }|j                  |       |dz  }|d	z  }d
|cxk  rdk  r@n n=|dz   |k  r5d
| |dz      cxk  rdk  r$n n!|dz  }|j                  |       |dz  }|dz  }|dz  }n|dz  }||k  r|dkD  r||z  nd}||t        |      fS )a  Single-pass GB18030 / GB2312 structural analysis.

    Only counts strict GB2312 2-byte pairs (lead 0xA1-0xF7, trail 0xA1-0xFE)
    and GB18030 4-byte sequences.  The broader GBK extension range
    (lead 0x81-0xFE, trail 0x40-0x7E / 0x80-0xFE) is intentionally excluded
    because it is so permissive that unrelated single-byte data (EBCDIC, DOS
    codepages, etc.) can score 1.0, leading to false positives.

    Returns (pair_ratio, mb_bytes, lead_diversity).
    r   r
   r.   r   r/   0   9   r      r+      r   r   r0   s	            r$   _analyze_gb18030rB      sY    JK	
BeE	AYF
f*G1!OJ ADQK/4/DQK/4/DQK/4/q 		!aQq D QUV^QU8St8Sq 		!aQFAFA3 f*4 )3QK*$CE"c%j  r&   c                 x   d}d}d}t               }d}t        |       }||k  r| |   }d|cxk  rdk  rdn na|dz  }|dz   |k  rN| |dz      }d|cxk  rdk  sn d|cxk  rdk  r.n n+|dz  }|j                  |       |dz  }|dkD  r|dz  }|d	z  }s|dz  }n|dz  }||k  r|dkD  r||z  nd
}	|	|t        |      fS )zSingle-pass Big5 structural analysis.

    Lead 0xA1-0xF9; Trail 0x40-0x7E, 0xA1-0xFE

    Returns (pair_ratio, mb_bytes, lead_diversity).
    r   r+      r   r   r   r.   r   r   r   r   r   s
             r$   _analyze_big5rE   '  s     JK	
BeE	AYF
f*G1!OJ1uv~QUE)T)tu/D/D1$KIIaL!GBt|aFAFAFA# f*$ )3QK*$CE"c%j  r&   c                 x   d}d}d}t               }d}t        |       }||k  r| |   }d|cxk  rdk  rdn na|dz  }|dz   |k  rN| |dz      }d|cxk  rdk  sn d|cxk  rdk  r.n n+|dz  }|j                  |       |dz  }|dkD  r|dz  }|d	z  }s|dz  }n|dz  }||k  r|dkD  r||z  nd
}	|	|t        |      fS )aW  Single-pass Big5-HKSCS structural analysis.

    Lead bytes: 0x87-0xFE
    Trail bytes: 0x40-0x7E, 0xA1-0xFE

    Extends Big5 by lowering the lead byte floor from 0xA1 to 0x87 and
    raising the ceiling from 0xF9 to 0xFE.  0x7F and 0x80-0xA0 are not
    valid Big5/HKSCS trail bytes.

    Returns (pair_ratio, mb_bytes, lead_diversity).
    r      r.   r   r   r   r+   r   r   r   r   r   s
             r$   _analyze_big5hkscsrH   L  s     JK	
BeE	AYF
f*G1!OJ1uv~QUE)T)tu/D/D1$KIIaL!GBt|aFAFAFA# f*$ )3QK*$CE"c%j  r&   c                    d}d}d}t               }d}t        |       }||k  r| |   }d|cxk  rdk  sn d|cxk  rdk  sn d|cxk  rdk  rin nf|dz  }|dz   |k  rS| |dz      }d	|cxk  rd
k  sn d|cxk  rdk  r3n n0|dz  }|j                  |       |dkD  r|dz  }|dkD  r|dz  }|dz  }|dz  }n|dz  }||k  r|dkD  r||z  nd}	|	|t        |      fS )zSingle-pass Johab structural analysis.

    Lead: 0x84-0xD3, 0xD8-0xDE, 0xE0-0xF9
    Trail: 0x31-0x7E, 0x91-0xFE

    Returns (pair_ratio, mb_bytes, lead_diversity).
    r               r   rD   r   1   r      r.   r   r   r   r   r   s
             r$   _analyze_johabrP   v  s    JK	
BeE	AYF
f*GA41#4#4$!:Kt:K!OJ1uv~QUE)T)tu/D/D1$KIIaL4xat|aFAFAFA# f*$ )3QK*$CE"c%j  r&   )shift_jis_2004cp932euc_jis_2004euc_krcp949gb18030	big5hkscsjohab
_ANALYZERSnamectxNc                     |j                   j                  |      }||S t        j                  |      }|y ||       }||j                   |<   |S )z/Return cached analysis or compute and cache it.N)analysis_cachegetrY   )r   rZ   r[   cachedanalyzerresults         r$   _get_analysisrb     sZ     ##D)F~~d#Hd^F%CtMr&   encoding_infoc                 ^    | r|j                   syt        | |j                  |      }|y|d   S )a  Return 0.0--1.0 indicating how well *data* matches the encoding's structure.

    For single-byte encodings, always returns 0.0.  For empty data, always
    returns 0.0.

    :param data: The raw byte data to analyze.
    :param encoding_info: Metadata for the encoding to probe.
    :param ctx: Pipeline context for caching analysis results.
    :returns: A structural fit score between 0.0 and 1.0.
    r   r   is_multibyterb   rZ   r   rc   r[   ra   s       r$   compute_structural_scorerh     s8     }114!3!3S9F~!9r&   non_ascii_countc                     | r|j                   syt        | |j                  |      }|y|d   }||n*t        |       t        | j	                  dt
                    z
  }|dk(  ry||z  S )av  Ratio of non-ASCII bytes that participate in valid multi-byte sequences.

    Genuine CJK text has nearly all non-ASCII bytes paired into valid
    multi-byte sequences (coverage close to 1.0), while Latin text with
    scattered high bytes has many orphan bytes (coverage well below 1.0).

    :param data: The raw byte data to analyze.
    :param encoding_info: Metadata for the encoding to probe.
    :param ctx: Pipeline context for caching analysis results.
    :param non_ascii_count: Pre-computed count of non-ASCII bytes, or ``None``
        to compute from *data*.
    :returns: A coverage ratio between 0.0 and 1.0.
    r   Nr   r   )rf   rb   rZ   r   	translater   )r   rc   r[   ri   ra   mb_bytes	non_asciis          r$   compute_multibyte_byte_coveragern     s|    & }114!3!3S9F~ayH & 	YT^^D*=>> 
 A~ir&   c                 ^    | r|j                   syt        | |j                  |      }|y|d   S )a  Count distinct lead byte values in valid multi-byte pairs.

    Genuine CJK text uses lead bytes from across the encoding's full
    repertoire.  European text falsely matching a CJK structural scorer
    clusters lead bytes in a narrow band.

    :param data: The raw byte data to analyze.
    :param encoding_info: Metadata for the encoding to probe.
    :param ctx: Pipeline context for caching analysis results.
    :returns: The number of distinct lead byte values found.
    r      r   re   rg   s       r$   compute_lead_byte_diversityrq     s8     }114!3!3S9F~!9r&   )N)__doc__collections.abcr   chardet.pipeliner   r   chardet.registryr   bytestuplefloatintr%   r(   r1   r3   r<   rB   rE   rH   rP   rY   dictstr__annotations__rb   rh   rn   rq    r&   r$   <module>r~      sD  	 % 8 ) #!
#!
5#s?#!L&!
&!
5#s?&!R7!
7!
5#s?7!t!
!
5#s?!@+!
+!
5#s?+!\.!
.!
5#s?.!b"!
"!
5#s?"!J'!
'!
5#s?'!T#!
#!
5#s?#!V )##	D
DhweS#o(>>??@ 	
!0
5#s?d"(
 ,3B
6 #'	$ 
$ $  
$  4Z	$ 
 $ N
 ,3Br&   