
    3j.                     H   U d Z ddlZddlZddlZddlZddlZddlZddlm	Z	m
Z
  ej                  d      j                  Z ej                  d      j                  ZdZi Zeeef   ed<    e	j(                         D ]4  Z eej.                        dk(  sej.                  d   eej0                  <   6 d	ed
eeeef   eeef   f   fdZej<                  d
eeeef   eeef   f   fd       Zd
eeef   fdZ deeef   d
eee!eedz  eef      f   fdZ"ej<                  d
eee!eedz  eef      f   fd       Z#ded
edz  fdZ$ded
e%fdZ&d
eeef   fdZ'ej<                  d
e(fd       Z) G d d      Z*	 dde*de(ez  ded
efdZ+	 dd	edede*dz  d
eeedz  f   fdZ,y)zModel loading and bigram scoring utilities.

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
    N)REGISTRYlookup_encodingz>Iz>ds   CMD2_SINGLE_LANG_MAP   datareturnc                 :   	 | dd t         k7  rd}t        |      d}t        | |      \  }|dz  }|dkD  rd| d}t        |      g }i }t        |      D ]p  }t        | |      \  }|dz  }|dkD  rd| d	}t        |      | |||z    j	                  d
      }||z  }t        | |      \  }	|dz  }|j                  |       |	||<   r t        j                  | |d       }
|dz  }t        |
      |k7  rdt        |
       d| }t        |      t        |
      }i }t        |      D ]  \  }}|dz  }|||dz    ||<    	 ||fS # t        j                  $ r}d| }t        |      |d}~wt        j                  t        f$ r}d| }t        |      |d}~ww xY w)zParse the v2 dense zlib-compressed models.bin format.

    :param data: Raw bytes of models.bin (must be non-empty).
    :returns: A ``(models, norms)`` tuple.
    :raises ValueError: If the data is corrupt or truncated.
    N   z&corrupt models.bin: missing CMD2 magici'  zcorrupt models.bin: num_models=z exceeds limit   zcorrupt models.bin: name_len=z exceeds 256zutf-8      z&corrupt models.bin: decompressed size z != expected zcorrupt models.bin: )	_V2_MAGIC
ValueError_unpack_uint32rangedecode_unpack_float64appendzlib
decompresslen
memoryview	enumerateerrorstructUnicodeDecodeError)r   msgoffset
num_modelsnamesnorms_name_lennamenormblobexpected_sizemvmodelsistartes                    D/DATA/.local/lib/python3.12/site-packages/chardet/models/__init__.py_parse_models_binr.      s   6%8y :CS/!&tV4!3J<~NCS/!"$z"A(v6KXaKF#~5hZ|L o%(!23::7CDhF%dF3GTaKFLLE$K #" tFG}-"U*t9%8T D,o/  S/! (* 'GAtIEeeem4F4L ( 5= :: %$QC(o1$LL,- %$QC(o1$%s$   D=E FE((FFFc                      t         j                  j                  d      j                  d      } | j	                         }|s t        j                  dt        d       i i fS t        |      S )zkLoad and parse models.bin, returning (models, norms).

    Cached: only reads from disk on first call.
    chardet.modelsz
models.binuX   chardet models.bin is empty — statistical detection disabled; reinstall chardet to fix   
stacklevel)		importlib	resourcesfilesjoinpath
read_byteswarningswarnRuntimeWarningr.   refr   s     r-   _load_models_datar>   a   sb     


#
#$4
5
>
>|
LC>>D'		
 2vT""    c                      t               d   S )zLoad all bigram models from the bundled models.bin file.

    Each model is a memoryview of length 65536 (256*256).
    Index: (b1 << 8) | b2 -> weight (0-255).

    :returns: A dict mapping model key strings to 65536-byte lookup tables.
    r   r>    r?   r-   load_modelsrC   v   s     q!!r?   r)   c                    i }| j                         D ]>  \  }}|j                  dd      \  }}|j                  |g       j                  |||f       @ t	        |      D ]  }t        |      }|||vs||   ||<    |S )zBuild a grouped index from a models dict.

    :param models: Mapping of ``"lang/encoding"`` keys to 65536-byte tables.
    :returns: Mapping of encoding name to ``[(lang, model, model_key), ...]``.
    /r   )itemssplit
setdefaultr   listr   )r)   indexkeymodellangencenc_name	canonicals           r-   _build_enc_indexrQ      s     BDElln
UIIc1%	cb!(($s);< % K#H-	 Ye%;$XE)  
 Lr?   c                  (    t        t                     S )zTReturn a pre-grouped index mapping encoding name -> [(lang, model, model_key), ...].)rQ   rC   rB   r?   r-   get_enc_indexrS      s     KM**r?   encodingc                 ,    t         j                  |       S )zReturn the language for a single-language encoding, or None.

    :param encoding: The canonical encoding name.
    :returns: An ISO 639-1 language code, or ``None`` if the encoding is
        multi-language.
    )r   getrT   s    r-   infer_languagerX      s     ))r?   c                     | t               v S )zReturn True if the encoding has language variants in the model index.

    :param encoding: The canonical encoding name.
    :returns: ``True`` if bigram models exist for this encoding.
    )rS   rW   s    r-   has_model_variantsrZ      s     }&&r?   c                      t               d   S )zAReturn cached L2 norms for all models, keyed by model key string.r   rA   rB   r?   r-   _get_model_normsr\      s    q!!r?   c                     t         j                  j                  d      j                  d      } | j	                         }t        |      dk7  r7t        j                  dt        |       dt        d       t        ddz        S t        |      S )	u  Return a 65536-byte IDF weight table for bigram profile construction.

    Loads a precomputed table from ``idf.bin`` (generated at training time).
    For each bigram index, the weight reflects how discriminative that bigram
    is across all models:

    - Bigrams in every model (common ASCII) → weight 1 (minimal signal)
    - Bigrams in one model → weight 255 (maximum signal)
    - Bigrams not in any model → weight 1 (unknown, treat as neutral)
    r0   zidf.binr   z chardet idf.bin has wrong size (z"), falling back to uniform weightsr1   r2      )
r4   r5   r6   r7   r8   r   r9   r:   r;   	bytearrayr<   s     r-   get_idf_weightsr`      s     


#
#$4
5
>
>y
IC>>D
4yE.s4yk :. .		
 5))T?r?   c                   H    e Zd ZdZdZdeddfdZedee	e	f   dd fd       Z
y)	BigramProfileu  Pre-computed bigram frequency distribution for a data sample.

    Computing this once and reusing it across all models reduces per-model
    scoring from O(n) to O(distinct_bigrams).

    Stores a dense ``freq`` list of length 65536 indexed by bigram index, plus
    a ``nonzero`` list of indices with non-zero frequency for fast iteration.
    Each bigram is weighted by its IDF (inverse document frequency) across all
    models — bigrams unique to few models get high weight, bigrams common to
    all models get weight 1.
    )freq
input_normnonzero
weight_sumr   r   Nc                    t        |      dz
  }|dk  rg | _        g | _        d| _        d| _        yt               }dgdz  }g }d}t        |      D ]C  }||   dz  ||dz      z  }||   }	||   dk(  r|j                  |       ||xx   |	z  cc<   ||	z  }E || _        || _        || _        d}
|D ]  }||   }|
||z  z  }
 t        j                  |
      | _        y)a?  Compute the bigram frequency distribution for *data*.

        Each bigram is weighted by its IDF (inverse document frequency) across
        all loaded models.  Bigrams unique to few models get high weight;
        bigrams common to all models get weight 1.

        :param data: The raw byte data to profile.
        r   r           Nr   r   )
r   rc   re   rf   rd   r`   r   r   mathsqrt)selfr   total_bigramsidfrc   re   w_sumr*   idxwnorm_sqvs               r-   __init__zBigramProfile.__init__   s    D	AA $&DI&(DL#$DO%(DO#+}%A7a<4A;.CCACyA~s#INIQJE & 	CS	Aq1uG  ))G,r?   weighted_freqc                 P    | d      }dgdz  }g }|j                         D ]  \  }}|||<   |s|j                  |         ||_        ||_        t	        |j                               |_        t        j                  t	        d |j                         D                    |_	        |S )aL  Create a BigramProfile from pre-computed weighted frequencies.

        Computes ``weight_sum`` and ``input_norm`` from *weighted_freq* to
        ensure consistency between the stored fields.

        :param weighted_freq: Mapping of bigram index to weighted count.
        :returns: A new :class:`BigramProfile` instance.
        r?   r   r   c              3   &   K   | ]	  }||z    y wNrB   ).0rr   s     r-   	<genexpr>z3BigramProfile.from_weighted_freq.<locals>.<genexpr>  s     *Q:PQ1q5:Ps   )
rF   r   rc   re   sumvaluesrf   ri   rj   rd   )clsrt   profilerc   re   ro   counts          r-   from_weighted_freqz BigramProfile.from_weighted_freq  s     c(#+'--/JCDIs# 0 ! !5!5!78!YYs*Q-:N:N:P*Q'QRr?   )__name__
__module____qualname____doc__	__slots__bytesrs   classmethoddictintr   rB   r?   r-   rb   rb      sM    
 @I&-U &-t &-P tCH~ /  r?   rb   r}   rL   	model_keyc                 \   | j                   dk(  ryt               }|r|j                  |      nd}|7d}t        d      D ]  }||   }|s|||z  z  } t	        j
                  |      }|dk(  ryd}| j                  }	| j                  D ]  }
|||
   |	|
   z  z  } ||| j                   z  z  S )zSScore a pre-computed bigram profile against a single model using cosine similarity.rh   Nr   r   )rd   r\   rV   r   ri   rj   rc   re   )r}   rL   r   r!   
model_normsq_sumr*   rr   dotrc   ro   s              r-   score_with_profiler     s     S E)29%JuAaA!a%  YYv&
S
C<<DuSzDI%% *w11122r?   c                     | s|yt               }|j                  |      }|y|t        |       }d}d}|D ]  \  }}}	t        |||	      }
|
|kD  s|
}|} ||fS )a  Score data against all language variants of an encoding.

    Returns (best_score, best_language). Uses a pre-grouped index for O(L)
    lookup where L is the number of language variants for the encoding.

    If *profile* is provided, it is reused instead of recomputing the bigram
    frequency distribution from *data*.

    :param data: The raw byte data to score.
    :param encoding: The canonical encoding name to match against.
    :param profile: Optional pre-computed :class:`BigramProfile` to reuse.
    :returns: A ``(score, language)`` tuple with the best cosine-similarity
        score and the corresponding language code (or ``None``).
    N)rh   Nrh   )rS   rV   rb   r   )r   rT   r}   rJ   variants
best_score	best_langrM   rL   r   ss              r-   score_best_languager   6  s    & GOOEyy"H%J I"*eYwy9z>JI	 #+ y  r?   ) rw   )-r   	functoolsimportlib.resourcesr4   ri   r   r9   r   chardet.registryr   r   Structunpack_fromr   r   r   r   r   str__annotations__r{   _encr   	languagesr$   r   tupler   floatr.   cacher>   rC   rI   rQ   rS   rX   boolrZ   r\   r_   r`   rb   r   r   rB   r?   r-   <module>r      s         6t$00&--%11	 $& $sCx. %HOOD
4>>a&*nnQ&7# 
A
A
4Z $sEz"223AH #5c:o!6S%Z8H!HI # #("T#z/* "j!	#tE#*j#567
78. +tCeC$J
C,G&H!IIJ + +
*S *S4Z *' ' '"$sEz* "
   0L L` MO33#,z#93FI3
36 %)&!
&!&! T!&! 5#*	&!r?   