
    Q3j?                    B(   d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZmZmZ d dlmZ d Zd	 Zej,                  j/                  d
g d      d        Zej,                  j/                  d
g d      d        Zej,                  j/                  dej4                  ej6                  ej8                  g      ej,                  j/                  dej4                  ej6                  ej8                  g      d               Zej,                  j/                  dej4                  ej6                  ej8                  g      d        Zd Zd Z d Z!d Z"d Z#ej,                  j/                  dg dg dg ejH                  g dg dg       ejH                  g dg dge%       ejH                  g dd ejL                  d!gge%       ejH                  g dd  e'd"      d!gge%       ejH                  g d#g d$ge%       ejH                  g d%d ejL                  dgge%       ejH                  g d%d  e'd"      dgge%      gg d&'      d(        Z(ej,                  j/                  d
g d      ej,                  j/                  d)d*d+g      ej,                  j/                  d,dd-g      d.                      Z)ej,                  j/                  d)d*d+g      ej,                  j/                  d/d0d1gd2d1gd0d1ggg d3g d4g d3gfd5d gd6d gd7d8gd6d ggg d9g d:g d;gfg      d<               Z*d= Z+ej,                  j/                  d,g d>      ej,                  j/                  d?g d>      d@               Z,ej,                  j/                  dAdBdCg      ej,                  j/                  dd2d0g ejH                  dDdEg      g      dF               Z-ej,                  j/                  dAdBdCg      dG        Z.ej,                  j/                  dHdId1gdJd1ggdIdJgd1ggej^                  f ejH                  d2d0gdKd0gg      d2dKgd0ggej`                  f ejH                  dLd!gdMd!gge%      dLdMgd!ggej^                  f ejH                  dLd!gdMd!gg      dLdMgd!ggejb                  f ejH                  d2d0gejL                  d0gg      d2ejL                  gd0ggej8                  f ejH                  dLejL                  gdejL                  gge%      dLdgejL                  ggej^                  f ejH                  dL e'd"      gd e'd"      gge%      dLdg e'd"      ggej^                  fgg dN'      dO        Z2ej,                  j/                  d
g d      ej,                  j/                  dP ejH                  d d8gge%      jf                   ejH                  d dQgge%      jf                  g dRgej^                  f ejH                  d2d0ggdS      jf                   ejH                  d2dTggdS      jf                  g dUgejh                  f ejH                  d d8gge%      jf                   ejH                  d dQgge%      jf                   ejH                  g dR      gej^                  f ejH                  dd gge%      jf                   ejH                  dd8gge%      jf                  g dVge%f ejH                  d d8gge%      jf                   ejH                  d ejL                  gge%      jf                  g dWge%f ejH                  d dgge%      jf                   ejH                  d ejL                  gge%      jf                  g dXge%fgg dY'      dZ               Z5d[ Z6ej,                  j/                  d\e
eg      d]        Z7d^ Z8d_ Z9ej,                  j/                  d`d-dadbgfdcg ddfg dedfdggfgg dh'      di        Z:dj Z;ej,                  j/                  dg dg dg ejH                  g dkg dlg       ejH                  g dg dge%      gg dm'      dn        Z<ej,                  j/                  dP ejH                  d d8gge%      jf                   ejH                  d dQgge%      jf                  g dRgej^                  f ejH                  d2d0ggdS      jf                   ejH                  d2dTggdS      jf                  g dUgejh                  f ejH                  d d8gge%      jf                   ejH                  d dQgge%      jf                   ejH                  g dR      gej^                  fgg do'      dp        Z=dq Z>dr Z?ej,                  j/                  dse'e@g      dt        ZAdu ZBdv ZCdw ZDdx ZEdy ZFdz ZGej,                  j/                  d,dcd-g      d{        ZHej,                  j/                  d|ejL                  d e'd"      g      d}        ZIej,                  j/                  d,dIdKgg d~g      d        ZJej,                  j/                  dd+d*gddg'      ej,                  j/                  d,d-g dgd-dg'      d               ZKej,                  j/                  d\e
eg      d        ZLej,                  j/                  ddd0iddiddid0dddTddg      ej,                  j/                  ddg dgg      d               ZMej,                  j/                  d,dcd-d8gg      d        ZNej,                  j/                  d,d gdQgg      d        ZOej,                  j/                  dddKiddiddiddiddidKdddTddg      d        ZPej,                  j/                  d,d-d8gg      d        ZQej,                  j/                  d,d gdQgg      d        ZRd ZSej,                  j/                  ddKd2dddTig      d        ZTd ZUd ZVd ZWd ZXd ZYej,                  j/                  ddd2dg      d        ZZej,                  j/                  dd0dKdg      d        Z[ej,                  j/                  dg d      ej,                  j/                  dg d      d               Z\d Z]ej,                  j/                  d|ejL                  dg      d        Z^d Z_ej,                  j/                  d
g d      ej,                  j/                  dddg      d               Z`ej,                  j/                  d
g d      d        Zaej,                  j/                  d
g d      d        Zbej,                  j/                  d
g d      d        Zcd Zdd Zeej,                  j/                  dejL                  dg      d        Zfej,                  j/                  dddg      ej,                  j/                  dejL                  dg      d               Zgej,                  j/                  dP ejH                  d ejL                  gge%      jf                   ejH                  d d8gge%      jf                   ejH                  d dQejL                  ge%      gej^                  f ejH                  d ejL                  gge%      jf                   ejH                  d d8gge%      jf                   ejH                  d dQejL                  ge%      gej^                  f ejH                  dejL                  ggej8                        jf                   ejH                  dDggej8                        jf                   ejH                  ddEejL                  g      gej8                  fgg d'      d        Zhej,                  j/                  d\e
eg      d        Ziej,                  j/                  d ejH                  dejL                  dDgg      jf                   ejH                  dejL                  dgg      jf                   ejH                  dEgg      f ejH                  g dâg      jf                   ejH                  g dĢg      jf                   ejH                  ejL                  gg      f ejH                  dejL                  d8gge%      jf                   ejH                  dejL                  dgg      jf                   ejH                  dQgge%      f ejH                  g dƢge%      jf                   ejH                  g dǢg      jf                   ejH                  ejL                  gge%      fg      dȄ        Zjej,                  j/                  de      dʄ        Zkd˄ Zlej,                  j/                  dddMgg ejH                  ddMggdά       ejH                  ddMggdϬ      g      ej,                  j/                  ddLdMgg ejH                  dLdMggdά       ejH                  dLdMggdϬ      g      dф               Zmd҄ Zndӄ ZodԄ Zpej,                  j/                  dd+d*g      dք        Zqej,                  j/                  d ejH                  d gdgge%      d gejL                  gejL                  gg ej                  dgdgdgge%      f ejH                  ejL                  gdgd gge%      d gejL                  gejL                  gg ej                  dgejL                  gejL                  gge%      fg      dل        Zsdڄ Ztdۄ Zud܄ Zvd݄ Zwdބ Zxej,                  j/                  dddKiddiddiddiddidKdddTddg      d߄        Zyd Zzd Z{d Z|d Z}ej,                  j/                  dddidd0ig      d        Z~ej,                  j/                  ddd2iddig      d        Zd Zd Zej,                  j/                  d\e
eg      d        Zd Zy)    N)sparse)NotFittedError)OneHotEncoderOrdinalEncoder)is_scalar_nan)_convert_containerassert_allcloseassert_array_equal)CSR_CONTAINERSc                     t        j                  g dg dg      } t               }t        d      }|j                  |       }|j                  |       }|j                  dk(  sJ |j                  dk(  sJ t        j                  |      sJ t        j                  |      rJ t        |j                         g dg dg       t        |j                         |       y )N         r   r   r   Fsparse_outputr      )              ?r   r   r   )r   r   r   r   r   )	nparrayr   fit_transformshaper   issparser
   toarray)X
enc_sparse	enc_denseX_trans_sparseX_trans_denses        V/DATA/.local/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_encoders.py!test_one_hot_encoder_sparse_denser$      s     	)Y'(AJE2I--a0N++A.M6)))&(((??>***}---  #<>W"X ~--/?    c                  :   t        j                  g dg dg      } t        d      }|j                  |       }|j                  dk(  sJ |j
                  j                  t         j                  k(  sJ |j                  j                  t         j                  k(  sJ y )Nr   r   Tr   csr)	r   r   r   r   formatindicesdtypeint32indptr)r   r   r!   s      r#   -test_one_hot_encoder_sparse_index_array_int32r-   *   s    
)Y'(AT2J--a0N  E)))!!''288333  &&"((222r%   handle_unknown)ignoreinfrequent_if_existwarnc                    t        j                  g dg dg dg      }t        j                  g dg      }t        d      }|j                  |       t	        j
                  t        d      5  |j                  |       d d d        t        |       }|j                  |       |j                         }t        |j                  |      j                         t        j                  g d	g             t        ||       y # 1 sw Y   xY w)
N)r   r   r   )r   r   r   )r   r   r   )   r   r   errorr.   Found unknown categoriesmatch)r   r   r   r   r   r   r   )r   r   r   fitpytestraises
ValueError	transformcopyr
   r   r	   r.   r   X2oh	X2_passeds        r#   #test_one_hot_encoder_handle_unknownrC   4   s    
)Y	23A	9+	B 
g	.BFF1I	z)C	D
R 
E 
n	5BFF1I	I
Y'')
567
 B	" 
E	Ds   /DDc                    t        j                  g d      j                  d      }t        j                  ddg      j                  d      }t        |       }|j	                  |       |j                         }t        |j                  |      j                         t        j                  g dg dg             t        ||       y )N)11111111223334444)r   55555rF   r5   )r   r   r   r   r   r   r   r   )	r   r   reshaper   r9   r>   r
   r=   r   r?   s        r#   +test_one_hot_encoder_handle_unknown_stringsrM   L   s    
23;;GDA	7D/	"	*	*7	3B
 
n	5BFF1I	I
Y'')
&(<=>
 r9%r%   output_dtypeinput_dtypec                    t        j                  ddgg|       j                  }t        j                  ddgddgg|      }t        d|      }t	        |j                  |      j                         |       t	        |j                  |      j                  |      j                         |       t        d|d      }t	        |j                  |      |       t	        |j                  |      j                  |      |       y )Nr   r   r*   auto)
categoriesr*   F)rS   r*   r   )	r   asarrayTr   r
   r   r   r9   r=   )rO   rN   r   
X_expectedrA   s        r#   test_one_hot_encoder_dtyperW   _   s     	

QF8;/11AaVaV,LAJ	&	=Br''*224jArvvay**1-557D	&E	RBr''*J7rvvay**1-z:r%   c                    t        j                  d      }|j                  ddgddgd      }t        j                  g dg dg| 	      }t        | 	      }t        |j                  |      j                         |       t        |j                  |      j                  |      j                         |       t        | d
      }t        |j                  |      |       t        |j                  |      j                  |      |       y )Npandasabr   r   ABr   r   r   r   r   r   r   r   rQ   F)r*   r   )r:   importorskip	DataFramer   r   r   r
   r   r   r9   r=   )rN   pdX_dfrV   rA   s        r#   !test_one_hot_encoder_dtype_pandasre   n   s    			X	&B<<sCj1v67D<6lKJ	\	*Br''-557Drvvd|--d3;;=zJ	\	?Br''-z:rvvd|--d3Z@r%   c                  Z   t               } g dg dg dg dg}| j                  |       | j                         }t        g d|       | j                  g d      }t        g d|       t	        j
                  t        d	      5  | j                  d
dg       d d d        y # 1 sw Y   y xY w)N)Maler   girlr   r   )Female)   rh   r   
   )rg   3   boy   r   )rg   [   rh         )	x0_Femalex0_Malex1_1x1_41x1_51x1_91x2_boyx2_girlx3_1x3_2x3_12x3_21x4_3x4_10x4_30)onetwothreefourfive)
one_Femaleone_Maletwo_1two_41two_51two_91	three_boy
three_girlfour_1four_2four_12four_21five_3five_10five_30z!input_features should have lengthr7   r   r   )r   r9   get_feature_names_outr
   r:   r;   r<   )encr   feature_namesfeature_names2s       r#   "test_one_hot_encoder_feature_namesr   ~   s    
/C!%"$		A GGAJ--/M	
" 	%* ../VWN	
" 	%* 
z)L	M!!5%.1 
N	M	Ms   B!!B*c                     t               } t        j                  ddggt              j                  }| j                  |       | j                         }t        ddg|       | j                  dg      }t        dd	g|       y )
Nu   c❤t1dat2rQ   u	   x0_c❤t1x0_dat2u   n👍meinput_featuresu   n👍me_c❤t1u   n👍me_dat2)r   r   r   objectrU   r9   r   r
   )r   r   r   s      r#   *test_one_hot_encoder_feature_names_unicoder      st    
/C
8V$%V466AGGAJ--/MY/?--i[-IM(.9=Ir%   c                     d } t        |       }t        j                  ddggt              j                  }|j                  |       |j                         }t        ddg|       |j                  dg	      }t        d
dg|       d }t        |      j                  |      }d}t        j                  t        |      5  |j                          ddd       y# 1 sw Y   yxY w)z=Check the behaviour of `feature_name_combiner` as a callable.c                 $    | dz   t        |      z   S )N_)reprfeaturecategorys     r#   name_combinerzHtest_one_hot_encoder_custom_feature_name_combiner.<locals>.name_combiner   s    }tH~--r%   )feature_name_combinerNoneNrQ   z	x0_'None'x0_NonerZ   r   za_'None'a_Nonec                      y)Nr    r   s     r#   wrong_combinerzItest_one_hot_encoder_custom_feature_name_combiner.<locals>.wrong_combiner   s    r%   zMWhen `feature_name_combiner` is a callable, it should return a Python string.r7   )r   r   r   r   rU   r9   r   r
   r:   r;   	TypeError)r   r   r   r   r   err_msgs         r#   1test_one_hot_encoder_custom_feature_name_combinerr      s    . m
<C
64.!022AGGAJ--/MY/?--cU-CM
H-}= n
=
A
A!
DCW  
y	0!!# 
1	0	0s   CC&c                     t        j                  ddgg      j                  } t               }|j	                  g dg       |j                         d   g dgk(  sJ |j                  |       j                         j                  dk(  sJ |j	                  g dg       |j                  |       j                         j                  dk(  sJ y )	Nr   r   )r   r   r   r   rS   rS   )r   r3   )r   r   r   r   r3   r   )	r   r   rU   r   
set_params
get_paramsr   r   r   )r   rA   s     r#   test_one_hot_encoder_set_paramsr      s    
1a&A	BMMl^M,==?<(\N:::A&&(..&888MMo.M/A&&(..&888r%   c                    t        d      }|j                  |       }t        dd      }|j                  |       }t        |j                         |       t	        j
                  |      r|j                  dk(  sJ |j                         S )NrR   r   FrS   r   r'   )r   r   r	   r   r   r   r(   )r   r   Xtr1Xtr2s       r#   check_categorical_onehotr      sq    
6
*CQD
6
?CQDDLLND)??4 T[[E%999<<>r%   r   defr   7   abcr   r   )rk   r   r   )r   r   r   )r[   r]   cat)rZ   r^   r   rQ   )r[   r   r   rZ   r   nan)Nr   r   )rZ   r   r   )Nr   N)mixednumericr   z	mixed-nanzmixed-float-nanz
mixed-Nonezmixed-None-nanzmixed-None-float-nan)idsc                 \   t        t        j                  |       d d dgf         }t        |ddgddgg       t        t        j                  |       d d ddgf         }t        |g dg dg       t	        d      j                  |       }t        |j                         g dg dg       y )	Nr   r   )r   r   r   r   r   r   r   r   rR   r   )r   r   r   r   r   )r   r   r   r   r   )r   r   r   r	   r   r   r   )r   Xtrs     r#   test_one_hot_encoderr      s    0 #288A;q1#v#6
7CC1a&1a&)*
"288A;q1a&y#9
:CC,56
6
*
8
8
;CCKKMO_#EFr%   sparse_FTdropfirstc                     g dg dg dg}t        ||      }|j                  |      }t        j                  |t              }t        |j                  |      |       ddgddgd	dgg}t        |d
|      }|j                  |      }t        j                  |      }t        |j                  |      |       |g dg dg dg}t        || ddgddgg dg      }|j                  |      }t        j                  |t              }d |d<   t        |j                  |      |       ddgddgd	dgg}t        |ddgddgg|       }|j                  |      }t        j                  |t              }d |d<   d |d d df<   t        |j                  |      |       t        j                  g dg dg      }t        j                  d      }t        j                  t        |      5  |j                  |       d d d        y # 1 sw Y   y xY w)Nr   r   )r   r   r   r   r   rQ   r   r   r   r   rR   )r   rS   r   r   r   )6   r   8   )r   r.   rS   )r   r   r   r   )r   rS   r.   r   r   r   r   r   r   )Shape of the passed X data is not correctr7   )r   r   r   r   r   r
   inverse_transformreescaper:   r;   r<   )r.   r   r   r   r   X_trexpmsgs           r#   test_one_hot_encoder_inverser     s    
8A
gD
9CQD
((1F
#Cs,,T2C8
R1b'Ar7#A
g&t
LCQD
((1+Cs,,T2C8| ^^<!)A=

   #hhq'D	3006< Wq"g2w'!AR))

   #hhq'D	AqD	3006< 88Y	*+D
))?
@C	z	-d# 
.	-	-s   )HHz
X, X_transr   r   r   r   r   r   r   r   r   r   r[   r   r   r   r   r   )r   r   r   r   r   )r   r   r   r   r   c                     t        |      j                  |       }d}|rt        |d      }t        j                  t
        |      5  |j                  |       ddd       y# 1 sw Y   yxY w)zCheck that `inverse_transform` raise an error with unknown samples, no
    dropped feature, and `handle_unknown="error`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/14934
    r   zqSamples \[(\d )*\d\] can not be inverted when drop=None and handle_unknown='error' because they contain all zerosr   r7   N)r   r9   r   r:   r;   r<   r   )r   X_transr   r   r   s        r#   ?test_one_hot_encoder_inverse_transform_raise_error_with_unknownr   K  s]    & g
.
2
21
5C	A 
 $Wh7	z	-g& 
.	-	-   A""A+c                      t        j                  ddgddgddggt              } t        dd	      }|j	                  |       }t        |j                  |      |        y )
Nrg   r   ri   r   r   rQ   	if_binaryFr   r   )r   r   r   r   r   r
   r   )r   oher   s      r#   &test_one_hot_encoder_inverse_if_binaryr   k  sV    
61+!}xm<FKA
[
>CQDs,,T2A6r%   )r   r   N
reset_dropc                    t        j                  ddgddgddggt              }t        | d      }|j	                  |       |j                  |      }|j                         }|j                  |	       t        |j                  |      |       t        |j                  |      |       t        |j                         |       y )
Nrg   r   ri   r   r   rQ   Fr   r   )r   r   r   r   r9   r=   r   r   r
   r   r	   )r   r   r   r   r   r   s         r#   test_one_hot_encoder_drop_resetr   r  s     	61+!}xm<FKA
T
7CGGAJ==D--/MNN
N#s,,T2A6CMM!$d+s002MBr%   methodr9   r         @      @c                     t               }d}t        j                  t        |      5   t	        ||      |        d d d        y # 1 sw Y   y xY w)Nz'Expected 2D array, got 1D array insteadr7   )r   r:   r;   r<   getattr)r   r   rA   r   s       r#   test_X_is_not_1Dr     s;     
B
3C	z	-FA 
.	-	-s   AAc                 
   t        j                  d      }|j                  g d      }t               }dt	        |       d}t        j
                  t        |      5   t        ||       |       d d d        y # 1 sw Y   y xY w)NrY   )   r   r3   r   z+Expected a 2-dimensional container but got z	 instead.r7   )r:   ra   Seriesr   typer;   r<   r   )r   rc   r   rA   r   s        r#   test_X_is_not_1D_pandasr     sd    			X	&B
		,A	B7Qy	
JC	z	-FA 
.	-	-s   A99BzX, cat_exp, cat_dtyper   r   r   r]   r^   )r   r   r   stringzmissing-floatzmissing-np.nan-objectzmissing-float-nan-objectc                    | | d d d   fD ]  }t        d      }|j                  |       t        |j                  t              sJ t        |j                  |      D ]w  \  }}|j                         }t        |d         rt        |d         sJ |d d |d d k(  sJ |j                         |k(  sJ t        j                  |j                  |      rwJ   y )NrI   rR   r   )r   r9   
isinstancecategories_listziptolistr   r   
issubdtyper*   )r   cat_exp	cat_dtypeXir   resr   res_lists           r#   test_one_hot_encoder_categoriesr    s    F !DbD'lv.#//4000COOW5HCzz|HSW%$Xb\222}CR000zz|s***==I666 6 r%   zX, X2, cats, cat_dtypedrZ   r[   cint64r3   r   r   r   )NrZ   z)rZ   r[   r	  )rZ   Nr	  )r   r   zobject-stringzobject-string-nonezobject-string-nanzobject-None-and-nanc                    t        |      }t        j                  g dg dg      }t        |j	                  |       j                         |       t        |j                  d         t        |d         k(  sJ |j                  d   j                         t        |d         k(  sJ |j                  d   j                  |k(  sJ t        |      }t        j                  t        d      5  |j                  |       d d d        t        ||      }t        j                  g dg dg      }t        |j                  |      j                  |      j                         |       y # 1 sw Y   jxY w)	Nr   r   r   r   r   r   r   r   r6   r7   rS   r.   )r   r   r   )r   r   r   r
   r   r   r   rS   r   r   r*   r:   r;   r<   r9   r=   )r   r@   catsr   r.   r   r   s          r#   )test_one_hot_encoder_specified_categoriesr    s&   f 4
(C
((O_5
6Cs((+335s;q!"d47m333??1$$&$tAw-777 ??1##y000 4
(C	z)C	D 
E
4
GC
((O_5
6Cswwr{,,R088:C@	 
E	Ds   -E((E1c                     t        j                  ddggt              j                  } t	        g dg      }t        j                  g dg dg      }t        |j                  |       j                  |       j                         |       t        |j                  |       j                         |       |j                  d   j                         g dk(  sJ t        j                  |j                  d   j                  t         j                        sJ t        j                  d	d
gg      j                  } t	        g dg      }d}t        j                   t"        |      5  |j                  |        d d d        y # 1 sw Y   y xY w)NrZ   r[   rQ   )r[   rZ   r  r   r  r  r   r   r   )r   r   r   z%Unsorted categories are not supportedr7   )r   r   r   rU   r   r
   r9   r=   r   r   r   r   r   r*   object_r:   r;   r<   )r   r   r   r   s       r#   (test_one_hot_encoder_unsorted_categoriesr    s   
3*V,..A
O#4
5C
((O_5
6Cswwqz++A.668#>s((+335s;??1$$&/999==+112::>>> 	1a&A
I;
/C
1C	z	-! 
.	-	-s   #E>>FEncoderc                 6   t        j                  dt         j                  dg      g} | |      }t        j                  ddggt              j                  }t        j                  t        d      5  |j                  |       ddd       y# 1 sw Y   yxY w)zTest encoder for specified categories that nan is at the end.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27088
    r   r   r   rQ   zNan should be the last elementr7   N)	r   r   r   r   rU   r:   r;   r<   r9   r  r  r   r   s       r#   ,test_encoder_nan_ending_specified_categoriesr     sl     HHa^$%D
T
"C
1a&(**A	z)I	J
 
K	J	Js   4BBc                     t        j                  ddgddggt              j                  } t	        g dg dg      }t        j                  g d	g d
g      }t        |j                  |       j                         |       |j                  d   j                         g dk(  sJ t        j                  |j                  d   j                  t         j                        sJ |j                  d   j                         g dk(  sJ t        j                  |j                  d   j                  t         j                        sJ y )NrZ   r[   r   r   rQ   r  )r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   rU   r   r
   r   r   r   r   r   r*   r  r   r   r   s      r#   7test_one_hot_encoder_specified_categories_mixed_columnsr  .  s    
3*q!f%V466A
OY#?
@C
((24RS
TCs((+335s;??1$$&/999==+112::>>>??1$$&)333==+112::>>>r%   c                      t        j                  d      } | j                  ddgddgd      }t        |      }t	        |g dg dg       y )	NrY   rZ   r[   r   r   r\   r_   r`   )r:   ra   rb   r   r	   )rc   rd   r   s      r#   test_one_hot_encoder_pandasr  ;  sF    			X	&B<<sCj1v67D
"4
(CC,56r%   zdrop, expected_namesx0_cx2_br   )r  x1_2r  )r  r   r[   x0_bx2_a)r   binarymanualc                     g dg dg}t        |       }|j                  |       |j                         }t        ||       y )N)r  r   rZ   )r[   r   r[   r   )r   r9   r   r
   )r   expected_namesr   r   r   s        r#   'test_one_hot_encoder_feature_names_dropr%  D  s;     
&A
T
"CGGAJ--/M~}5r%   c                     ddgddgddgg} t        j                  g dg dg dg      }t        j                  d d	g      }t        d
d      }|j                  |       }t	        |j
                  |       t        ||       ddgddgddgg} t        j                  ddgddgddgg      }t        j                  d	d g      }t        d
d      }|j                  |       }t	        |j
                  |       t        ||       y )Nrk   yes   norq   )r   r   r   r   rK   )r   r   r   r   r   r   Fr   truerZ   falser   r   )r   r   r   r   r
   	drop_idx_r	   )r   expectedexpected_drop_idxr   results        r#   *test_one_hot_encoder_drop_equals_if_binaryr0  V  s   
er4j2u+.Axx	35IJH $+
[
>Cq!Fs}}&78FH% ###7Axx#sc3Z#s<=H!T+
[
>Cq!Fs}}&78FH%r%   )rk   r   r   )r(  r   r   )r   r   r   c                     t               }t        j                  g dg dgd      }t        |j	                  |       |j                  d             t        d      }t        |j	                  |       |       y )Nr   r   r   r   r   r   r  rQ   float64)r   r   r   r
   r   astyper  s      r#   test_ordinal_encoderr6  n  s^     
C
((Iy)
9Cs((+SZZ	-BC
w
'Cs((+S1r%   )r   r   zobject-string-catc                    t        |      }t        j                  dgdgg      }t        |j	                  |       |       t        |j                  d         t        |d         k(  sJ |j                  d   j                         t        |d         k(  sJ |j                  d   j                  |k(  sJ t        |      }t        j                  t        d      5  |j                  |       d d d        y # 1 sw Y   y xY w)Nr   r   r   r   r6   r7   )r   r   r   r
   r   r   rS   r   r   r*   r:   r;   r<   r9   )r   r@   r  r   r   r   s         r#   )test_ordinal_encoder_specified_categoriesr8    s    2 D
)C
((SEC5>
"Cs((+S1q!"d47m333??1$$&$tAw-777 ??1##y000 D
)C	z)C	D 
E	D	Ds   C88Dc                     g dg dg} t               }|j                  |       }t        j                  | t              }t        |j                  |      |       t        j                  g dg dg      }t        j                  d      }t        j                  t        |      5  |j                  |       d d d        y # 1 sw Y   y xY w)Nr   r   rQ   )r   r   r   r   r_   r   r7   )r   r   r   r   r   r
   r   r   r   r:   r;   r<   )r   r   r   r   r   s        r#   test_ordinal_encoder_inverser:    s    	(A

CQD
((1F
#Cs,,T2C8 88\<01D
))?
@C	z	-d# 
.	-	-s   %C  C	c                     t        dd      } t        j                  ddgddgdd	ggt        
      }t        j                  ddgddgddggt        
      }| j	                  |       | j                  |      }t        j                  ddgddgddggd
      }t        ||       | j                  |      }t        j                  dd gd dgddggt        
      }t        ||       y )Nuse_encoded_valuer.   unknown_valuerZ   xr[   yr  r	  rQ   xyblar   r   r   r  )r   r   r   r   r9   r=   r
   r   )r   X_fitr   X_trans_encr   X_trans_invinv_exps          r#   +test_ordinal_encoder_handle_unknowns_stringrH    s    
(;2
NCHHsCj3*sCj9HEhhdeS\C:>fMGGGEN--(K
((QGb!Wq!f-W
=C{C(''4KhhddC[3*=VLG{G,r%   r*   c                    t        dd      }t        j                  ddgddgdd	gg| 
      }t        j                  ddgddgddgg| 
      }|j                  |       |j	                  |      }t        j                  ddgddgddggd
      }t        ||       |j                  |      }t        j                  dd gd dgddggt        
      }t        ||       y )Nr<  r>  r      r      r   	   rQ   rn      r   r  )r   r   r   r9   r=   r
   r   r   )r*   r   rD  r   rE  r   rF  rG  s           r#   ,test_ordinal_encoder_handle_unknowns_numericrO    s    
(;4
PCHHq!fq!fq!f-U;EhhB"a1a&1?GGGEN--(K
((QIay1a&1
AC{C(''4KhhD	D!9q!f5VDG{G,r%   c                      t        dt        j                        } t        j                  dgdgdgg      }| j	                  |       | j                  dgdgdgg      }t        |dgdgt        j                  gg       y )Nr<  r>  r   r   r   r3   r   )r   r   r   r   r9   r=   r
   )r   rD  r   s      r#   (test_ordinal_encoder_handle_unknowns_nanrQ    so     (;266
RCHHqcA3_%EGGENmmaS1#sO,Gw!qcBFF8 45r%   c                      t        dt        j                  t              } t        j                  dgdgdgg      }t        j                  t        d      5  | j                  |       d d d        y # 1 sw Y   y xY w)Nr<  )r.   r?  r*   r   r   r   z'dtype parameter should be a float dtyper7   )	r   r   r   intr   r:   r;   r<   r9   )r   rD  s     r#   8test_ordinal_encoder_handle_unknowns_nan_non_float_dtyperT    s\     *"&&C HHqcA3_%E	z)R	S 
T	S	Ss   A22A;c                      t        j                  g dgt              j                  } g d}t	        |      }d}t        j                  t        |      5  |j                  |        d d d        y # 1 sw Y   y xY w)N)LowMediumHighrW  rV  rQ   )rV  rW  rX  r   z*Shape mismatch: if categories is an array,r7   )	r   r   r   rU   r   r:   r;   r<   r9   )r   r  r   r   s       r#   +test_ordinal_encoder_raise_categories_shaperY    sU    
<=VLNNA$D
D
)C
6C	z	-
 
.	-	-s   A11A:c            	         t        d      } t        j                  g dg dgd      }t        j                  ddgd	d
ggd      t        j                  ddgd	d
ggd      t        j                  ddgddgg      t        j                  ddgddgg      t        j                  ddgd	dggd      fD ]  }| j                  |       t	        t        d      D cg c](  }| j                  |   j                  |j                  k(  * c}      sJ t        | j                  |      j                         |        ddgd	d
gg}| j                  |       t	        t        d      D cg c]=  }t        j                  | j                  |   j                  t        j                        ? c}      sJ t        | j                  |      j                         |       ddgd	dgg}| j                  |       t	        t        d      D cg c]  }| j                  |   j                  dk(    c}      sJ t        | j                  |      j                         |       y c c}w c c}w c c}w )NrR   r   )r   r   r   r   )r   r   r   r   r4  rQ   r   r   r   r3   r  rZ   r[   r  r     a   b   c   dr   )r   r   r   r9   allranger   r*   r
   r=   r   r   integer)r   r   r   is       r#   test_encoder_dtypesrc    s   
6
*C
(((*>?y
QC 	1a&1a&!1
1a&1a&!3
3*sCj)*
4,t-.
1c(QH%X6 	
qJACOOA&,,7JKKK3==+335s; Q!QAGGAJUSTXVXcooa066

CXVWWWs}}Q'//137
SAs8AGGAJeAhGh"((H4hGHHHs}}Q'//137 K
 W
 Hs   -I
&AI #I%c                     t        j                  d      } t        d      }t        j                  g dg dgd      }| j                  dd	gd
dgddgdd      }|j                  |       t        t        d	      D cg c]  }|j                  |   j                  dk(    c}      sJ t        |j                  |      j                         |       | j                  dd	gddgddgd      }g d}|j                  |       t        t        d
      D cg c]!  }|j                  |   j                  ||   k(  # c}      sJ t        |j                  |      j                         |       y c c}w c c}w )NrY   rR   r   )r   r   r   r   r   r   )r   r   r   r   r   r   r4  rQ   r   r   r   r3   r   r   r]   r^   Cr  rZ   r[   r   r   )r  r   r4  )r:   ra   r   r   r   rb   r9   r_  r`  r   r*   r
   r=   r   )rc   r   r   r   rb  expected_cat_types         r#   test_encoder_dtypes_pandasrh    sO   			X	&B
6
*C
((	')GHC
 	Aq6AaV<GLAGGAJU1XFX"((G3XFGGGs}}Q'//137
Aq6c
#sDEA6GGAJ%PQ(S(Q"((,=a,@@(STTTs}}Q'//137 G Ts   ?#E/&E4c                      t               } ddgddgg}t        j                         5  t        j                  d       | j	                  |       d d d        y # 1 sw Y   y xY w)Nrg   r   ri   r   r4   )r   warningscatch_warningssimplefilterr   )r   r   s     r#   test_one_hot_encoder_warningrm  '  sO    
/C
!xm$A		 	 	"g&! 
#	"	"s   'AA c                 >   ddgddgddgg}t        | ddddgddgg      }|j                  |       d	dgg}t        j                  ddgg      }d
}t	        j
                  t        |      5  |j                  |      }ddd       t        |       y# 1 sw Y   xY w)z,Check handle_unknown='warn' works correctly.rZ   r   r[   r   r   Fr1   r   r   r.   rS   r  Found unknown categories in columns \[0\] during transform. These unknown categories will be encoded as the infrequent category.r7   N	r   r9   r   r   r:   warnsUserWarningr=   r	   )r   r   r   X_testrV   warn_msgr   s          r#   test_ohe_handle_unknown_warnrv  /  s     qC8c1X&A
#JA'	C GGAJAhZFAq6(#J	  
 
k	2--' 
3GZ( 
3	2s   ,BBmissing_valuec                    dddd| g}t        |      }g dg ddddd| gg}|j                  |      j                         }g dg d	g d
g}t        ||       |j                  |u sJ t        |j                  |j                        D cg c]
  \  }}||    }}}|j                  |      }	t        j                  |t              }
t        |d         rt        |d d |d d        t        |d         sJ t        |d         sJ t        |
d d d df   |	d d d df          t        |
dd df   |	dd df          t        |
d         sJ t        |	d         sJ y t        ||       t        |
|	       y c c}}w )Nr   rn   r   r   r   )r   rn   r   r   rZ   )r   rn   r   r   rZ   )r   r   r   r   r   )r   r   r   r   r   r   rQ   rI   )rI   rI   )r   r   r   r
   r   r   r   r,  r   r   r   r   r   )rw  cats_to_dropr   r   transr   r   r   dropped_catsX_inv_transX_arrays              r#    test_one_hot_encoder_drop_manualr~  J  s   2q"m4L
\
*C	Ar=)	A
 a ((*EO_
=Cuc"88|### *-S__cmm)L)LgG)L   ''.Khhq'G \"%&<,l3B.?@\"-...\"-...71crc6?K3B3,?@ 	72ss7+[SbS-ABWV_---[0111<67K0)s   E;)r   r   rj   rZ   c                     t        |       }d}t        j                  t        |      5  |j	                  g dg dg dg       d d d        y # 1 sw Y   y xY w)Nr   z-`drop` should have length equal to the numberr7   r   r   )r   r   ;   )r   r:   r;   r<   r9   )r   r   r   s      r#   test_invalid_drop_lengthr  o  s>    
T
"C=G	z	1@A 
2	1	1s   AAdensityr   denserZ   r   r[   r"  c                    t        |       }t        | |      }g dg dg}|j                  |       |j                  |       t        |j                  |j                         |dk(  rt        |j                  d       n=t        ||j                  |j                        D ]  \  }}}|t        |         |k(  rJ  t        |j                  t        j                        sJ |j                  j                  t        k(  sJ y )Nr   r   )r  r   rZ   r  r   r   )r   r9   r
   r   r,  r   rS  r   r   ndarrayr*   r   )r  r   ohe_baseohe_testr   drop_catdrop_idxcat_lists           r#   test_categoriesr  w  s     73H7>H	&ALLOLLOx++X-A-ABw8--q1,/($$h&:&:-
(Hh CM*h666-
 h(("**555##v---r%   c                 Z     |        j                         j                  j                  sJ y )N)__sklearn_tags__
input_tagscategorical)r  s    r#   "test_encoders_has_categorical_tagsr    s"    9%%'22>>>>r%   kwargsmax_categoriesmin_frequency   g(\?r   )r  r  rn   rS   rR   rZ   r[   r  r  c                 .   t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   g      j                  }t        d|d	d
d| j	                  |      }t        |j                  g dg       dgdgdgdgdgg}t        j                  ddgddgddgddgddgg      }|j                  |      }t        ||       dgdgdz  z   D cg c]  }|g }}|j                  |      }	t        ||	       |j                         }
t        ddg|
       yc c}w )zpTest that different parameters for combine 'a', 'c', and 'd' into
    the infrequent category works as expected.rZ   r   r[   r(  r  rk   r  r   r0   F)rS   r.   r   rZ   r  r  er   r   infrequent_sklearnr3   r  x0_infrequent_sklearnNr   r   r   rU   r   r9   r
   infrequent_categories_r=   r	   r   r   )r  rS   X_trainr   rt  r-  r   colexpected_invX_invr   s              r#   test_ohe_infrequent_two_levelsr    sL    hh	SEBJ.#;seaiGHIKKG
 , 	
 
c'l  s11O3DEecUSEC53%0Fxx!Q!Q!Q!Q!Q@AHmmF#GHg&&)U.B-Ca-G%GH%GcSE%GLH!!'*E|U+--/M 78-H Is   

Dc                    t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   g      j                  }t        d	d
d|       j	                  |      }|j
                  d   |j                  d      dk(  sJ t        j                  dgdgg      }|j                  |      }t        dgdgg|       |j                         }t        dg|       |j                  |      }t        dgdgg|       y)z3Test two levels and dropping the frequent category.rZ   r   r[   r(  r  rk   r  r   r0   Fr   r.   r   r  r   r   r   r  r  N)r   r   rU   r   r9   r   r,  r=   r	   r   r
   r   )r   r  r   rt  r   r   	X_inverses          r#   ,test_ohe_infrequent_two_levels_drop_frequentr    s    hh	SEBJ.#;seaiGHIKKG
,	
 
c'l  ??1cmmA./3666XXusen%FmmF#GaS1#J(--/M/0-@%%g.I 456	Br%   c                 (   t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   g      j                  }t        d	d
d|       }d| d   d}t	        j
                  t        |      5  |j                  |       ddd       y# 1 sw Y   yxY w)z_Test two levels and dropping any infrequent category removes the
    whole infrequent category.rZ   r   r[   r(  r  rk   r  r   r0   Fr   r  Unable to drop category r   ( from feature 0 because it is infrequentr7   Nr   r   rU   r   r:   r;   r<   r9   r   r  r   r   s       r#   5test_ohe_infrequent_two_levels_drop_infrequent_errorsr    s    
 hh	SEBJ.#;seaiGHIKKG
,	C %T!WK/W
XC	z	- 
.	-	-   -BBrM  gQ?g{Gz?rL  c                 
   t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   g      j                  }t        dd	d
d| j	                  |      }t        |j                  ddgg       dgdgdgdgdgg}t        j                  g dg dg dg dg dg      }|j                  |      }t        ||       dgdgdgdgdgg}|j                  |      }t        ||       |j                         }t        g d|       y)zkTest that different parameters for combing 'a', and 'd' into
    the infrequent category works as expected.rZ   r   r[   r(  r  rk   r  r   r0   Fr.   r   r  r3  r   r   r   r2  r  )r  r  r  Nr   r  )	r  r  r   rt  r-  r   r  r  r   s	            r#    test_ohe_infrequent_three_levelsr    s'     hh	SEBJ.#;seaiGHIKKG
 ,EEK	c'l  s11S#J<@ecUSEC53%0FxxIy)YOPHmmF#GHg& 
				L !!'*E|U+--/M@-Pr%   c                 $   t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   g      j                  }t        d	d
d|       j	                  |      }t        j                  dgdgdgg      }t        ddgddgddgg|j                  |             |j                  d      j	                  |       d}t        j                  t        |      5  |j                  dgdgg      }ddd       t        ddgddgg       y# 1 sw Y   xY w)z5Test three levels and dropping the frequent category.rZ   r   r[   r(  r  rk   r  r   r0   Fr  r   r   r/   r5   r6   r7   r  N)r   r   rU   r   r9   r	   r=   r   r:   rr  rs  )r   r  r   rt  r   r   s         r#   .test_ohe_infrequent_three_levels_drop_frequentr  
  s    hh	SEBJ.#;seaiGHIKKG
,	
 
c'l  XXusecU+,FaVaVaV,cmmF.CD NN(N+//8
$C	k	---#/ 
. aVaV$g. 
.	-s   DDc                 (   t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   g      j                  }t        d	d
d|       }d| d   d}t	        j
                  t        |      5  |j                  |       ddd       y# 1 sw Y   yxY w)z7Test three levels and dropping the infrequent category.rZ   r   r[   r(  r  rk   r  r   r0   Fr  r  r   r  r7   Nr  r  s       r#   7test_ohe_infrequent_three_levels_drop_infrequent_errorsr  "  s     hh	SEBJ.#;seaiGHIKKG
,	C %T!WK/W
XC	z	- 
.	-	-r  c                      t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   g      j                  } t        d	d
d      j	                  |       }t        |j                  ddgg       dgdgdgdgg}t        j                  g dg dg dg dg      }|j                  |      }t        ||       dgg}d}t        j                  t        |      5  |j                  |       ddd       y# 1 sw Y   yxY w)zmTest that different parameters for combining 'a', and 'd' into
    the infrequent category works as expected.rZ   r   r[   r(  r  rk   r  r   r4   F)r.   r   r  r3  r  r2  badz.Found unknown categories \['bad'\] in column 0r7   N)r   r   rU   r   r9   r
   r  r=   r	   r:   r;   r<   )r  r   rt  r-  r   r   s         r#   (test_ohe_infrequent_handle_unknown_errorr  2  s     hh	SEBJ.#;seaiGHIKKG
eA	c'l  s11S#J<@ ecUSEC5)FxxIy)DEHmmF#GHg& gYF
;C	z	-f 
.	-	-s   C44C=c                    t        j                  dgdz  dgdz  z   gt              j                  }t	        dg dgddd	| j                  |      }dgd
gdgdgdgg}t        j                  ddgddgddgddgddgg      }|j                  |      }t        ||       dddgg}dgdgg}|D ]B  }|j                  |      j                  |       t        dgdgg|j                  |             D y)zG'a' is the only frequent category, all other categories are infrequent.rZ   r   r  rq   rQ   r  r  rZ   r[   Fr0   rS   r   r.   r[   r  r  r   r   r   r   r   Nr   )	r   r   r   rU   r   r9   r=   r	   r   )r  r  r   rt  r-  r   dropsr   s           r#   5test_ohe_infrequent_two_levels_user_cats_one_frequentr  J  s    hh	SEBJ./v>@@G
 (), 	
 
c'l  ecUSEC53%0Fxx!Q!Q!Q!Q!Q@AHmmF#GHg& kC5)EecU^FD!%%g.!qc
CMM&$9: r%   c                     t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   gt        	      j                  } t	        g d
gddd      j                  |       }t        |j                  g dg       dgdgdgdgdgg}t        j                  ddgddgddgddgddgg      }|j                  |      }t        ||       dgdgdz  z   D cg c]  }|g }}|j                  |      }t        ||       yc c}w )zFTest that the order of the categories provided by a user is respected.rZ   r   r[   r(  r  rk   r  r   rQ   r  Fr0   r   rS   r   r.   r  )r  r  rZ   r  r   r   r  r3   Nr   r   r   rU   r   r9   r
   r  r=   r	   r   )r  r   rt  r-  r   r  r  r  s           r#   (test_ohe_infrequent_two_levels_user_catsr  f  s.   hh
cURZ	3%"*	,uqy	89a  (),	
 
c'l  s11O3DEecUSEC53%0Fxx!Q!Q!Q!Q!Q@AHmmF#GHg& '*U.B-Ca-G%GH%GcSE%GLH!!'*E|U+ Is   
C=c                     t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   gt        	      j                  } t	        g d
gddd      j                  |       }t        |j                  ddgg       dgdgdgdgdgg}t        j                  g dg dg dg dg dg      }|j                  |      }t        ||       dgdgdgdgdgg}|j                  |      }t        ||       y)zTest that the order of the categories provided by a user is respected.
    In this case 'c' is encoded as the first category and 'b' is encoded
    as the second one.rZ   r   r[   r(  r  rk   r  r   rQ   r  r  r[   rZ   Fr0   r  r  r2  r  r3  r  Nr  )r  r   rt  r-  r   r  r  s          r#   *test_ohe_infrequent_three_levels_user_catsr    s   
 hh
cURZ	3%"*	,uqy	89a  (),	
 
c'l  s11S#J<@ecUSEC53%0FxxIy)YOPHmmF#GHg&
 
				L !!'*E|U+r%   c                      t         j                  g dg df   } t        ddd      }|j                  |        ddgddgg}|j	                  |      }t        |g d	g d
g       y)zaTest infrequent categories where feature 0 has infrequent categories,
    and feature 1 does not.	r   r   r   r   r   r   r   r   r   	r   r   r   r   r   r   r   r   r   r   r   F)r  r   r   r   r   r   r   r   r   )r   r   r   r   N)r   c_r   r9   r=   r	   )r   r   rt  r   s       r#   test_ohe_infrequent_mixedr    sc     	)+FFGA
q{%
PCGGAJ!fq!fFmmF#G GlL9:r%   c            
      b   t         j                  g dg dg df   } t        ddd      }|j                  |       j	                         }t        |j                  d   d	d
g       t        |j                  d	   d	dg       t        |j                  d
   d       |j                         }t        g d|       g dg dg dg dg dg dg dg dg dg	}t        ||       g dg dg}|j                  |      }g dg dg}t        ||j	                                |j                  |      }t        j                  g dg dgt              }t        ||       t        ddd      j                  |       }t        j                  t         d      5  |j                  |       ddd       g d g d!g}|j                  |      }g d"g dg}t        ||j	                                |j                  |      }t        j                  g d#g d$gt              }t        ||       y# 1 sw Y   xY w)%z?Test infrequent categories with feature matrix with 3 features.r  )	r   r   r   r   r   rk   r   r   r   )	r   r   r   r   r   r   r   r   r   rR   r   r0   rS   r  r.   r   r   r   rk   N)x0_0x0_3r  x1_0x1_5x1_infrequent_sklearnx2_0x2_1)r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   )r3   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r  N)r  r   NrQ   r4   r6   r7   )r   r   r   )r   rk   r   )r   r   r   r   r   r   r   r   )r  r  r   )r   r  r   )r   r  r   r   r   r
   r  r   r	   r=   r   r   r   r9   r:   r;   r<   )	r   r   r   r   r-  rt  X_test_transr  r  s	            r#   'test_ohe_infrequent_multiple_categoriesr    s    	#$#	%	A !<QC "**,Gs11!4q!f=s11!4q"g>s11!4d;
 --/M		
 	 	!        
H Hg&#F==(L )*BCHHl2245!!,/E88	(*IJRXL |U+ !G	c!f  
z)C	Df 
E $F==(L(*BCHHl2245!!,/E88	8:VWL |U+! 
E	Ds   H%%H.c            
         t        j                  d      } | j                  g dg ddddg      }t        dd	d
      }|j	                  |      j                         }t        |j                  d   ddg       t        |j                  d   g d       g dg dg dg dg dg dg dg dg dg	}t        ||       | j                  ddgddgdddg      }g dg dg}|j                  |      }t        ||j                                |j                  |      }t        j                  ddgddggt              }t        ||       | j                  ddgddgdddg      }|j                  |      j                         }g dg dg}t        ||       |j                  |      }t        j                  ddgddggt              }t        ||       y)zHTest infrequent categories with a pandas dataframe with multiple dtypes.rY   	rZ   fr  r  r  rZ   r  r[   r[   	r   r   r   rk   rk   rn   r   r   r   )strrS  r  rS  columnsrR   r   r0   r  r   rZ   r[   r   r   r   rn   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   r     rn   r  rQ   r  r   N)r:   ra   rb   r   r   r   r
   r  r	   r=   r   r   r   r   )	rc   r   r   r   r-  rt  r  r  r  s	            r#   .test_ohe_infrequent_multiple_categories_dtypesr    s    
		X	&B
@1	
  	 	A !<QC "**,Gs11!4sCjAs11!4jA 	
H Hg&\\3*b"X>PU\WF"$67H==(LHl2245!!,/E88
 4	5=Q7RSL |U+ \\3*b!W=u~\VF==(002L"$67HHl+!!,/E88
#	$';Q&?@L |U+r%   rp   )r  r  c                     t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   g      j                  }t        dd	d
d| }|j	                  |       |j                  dgg      }t        |dgg       y),All user provided categories are infrequent.rZ   r   r[   r(  r  rk   r  r   r0   Fr  r   Nr   )r   r   rU   r   r9   r=   r	   r  r  r   r   s       r#   $test_ohe_infrequent_one_level_errorsr  S  s     hh	SEBJ.#;seaiGHIKKG
 ,EEKC GGGmmcUG$GGqcU#r%   c                     t        j                  dgdz  gt              j                  }t	        dg dgddd| j                  |      }|j                  dgdgg      }t        |d	gd	gg       y
)r  r  r   rQ   r  Fr0   r  rZ   r   Nr   )r   r   r   rU   r   r9   r=   r	   r  s       r#   5test_ohe_infrequent_user_cats_unknown_training_errorsr  a  s     hh	{&133G
 (), 	
 
c'l  mmcUSEN+GGqcA3Z(r%   zinput_dtype, category_dtype)OOOUUOUUSOSUSS
array_type)r   r   rY   c                    t        j                  dgdgg|       }t        j                  ddg|      g}t        |d      j                  |      }t	        dgdgdgdgg||       }|j                  |      }t        j                  ddgddgddgddgg      }t        ||       t        |      j                  |      }	|	j                  |      }t        j                  dgdgdgdgg      }t        ||       y	)
a"  Check that encoding work with object, unicode, and byte string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/15616
    https://github.com/scikit-learn/scikit-learn/issues/15726
    https://github.com/scikit-learn/scikit-learn/issues/19677
    r[   rZ   rQ   Fr   r   r   r   N)	r   r   r   r9   r   r=   r	   r   r
   )
rO   category_dtyper  r   rS   r   rt  r   r-  oes
             r#   test_encoders_string_categoriesr  r  s     	3%#{3A((C:^<=J
:U
C
G
G
JC
use$jF mmF#Gxx!Q!Q!Q!Q89HGX&	:	.	2	21	5Bll6"Gxx!qcA3,-Hw)r%   c                  4   t        j                  dgdggd      } t        j                  ddgd      g}t        |d      }t        j                  d      }t        j                  t        |	      5  |j                  |        d
d
d
       y
# 1 sw Y   y
xY w)zCheck that this mixture of predefined categories and X raises an error.

    Categories defined as bytes can not easily be compared to data that is
    a string.
    r[   rZ   UrQ   SFr   zjIn column 0, the predefined categories have type 'bytes' which is incompatible with values of type 'str_'.r7   N)	r   r   r   r   r   r:   r;   r<   r9   )r   rS   r   r   s       r#   $test_mixed_string_bytes_categoricalsr    sy     	3%#s+A((C:S12J
:U
CC
))	'C
 
z	-
 
.	-	-s   3BBc                     t        j                  dd| d| ggt              j                  }t	        dd      j                  |      }|j                         }t        |ddd	|  g       y )
NrZ   r[   rQ   Fr/   r   r.   x0_ar  x0_)r   r   r   rU   r   r9   r   r
   )rw  r   r   namess       r#   )test_ohe_missing_values_get_feature_namesr    se     	3]C?@OQQA
eH
E
I
I!
LC%%'Euvv]O/DEFr%   c            	      (   t        j                  d      } | j                  g dt        j                  dddt        j
                  gt              ddd	g
      }t        j                  g dg dg dg dg      }t        |      }t        ||       y )NrY   )dogr   Nr   r   r   r3   rQ   )col1col2r  r  r  )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )	r:   ra   rb   r   r   r   floatr   r	   )rc   dfexpected_df_transr   s       r#   %test_ohe_missing_value_support_pandasr    s    			X	&B	/HHaArvv.e<	
   
 
B !!!!		
 #2
&CC*+r%   pd_nan_typepd.NAznp.nanc           
         t        j                  d      }| dk(  r|j                  nt        j                  }|j                  d|j                  dd|ddgd      i      }t        j                  g d	g d
g dg dg d
g      }t        d|      }|j                  |      }t        ||       t        |j                        dk(  sJ t        |j                  d   d d g d       t        j                  |j                  d   d         sJ y )NrY   r  r  r  rZ   r[   r   rQ   )r   r   r   r   )r   r   r   r   )r   r   r   r   r  Fr  r   r   rI   r  )r:   ra   NAr   r   rb   r   r   r   r   r	   lenr   r
   isnan)r  r.   rc   pd_missing_valuer  r  r   df_transs           r#   1test_ohe_missing_value_support_pandas_categoricalr    s     
		X	&B +w 6ruuBFF	BIIsC)93DJIW	

B
 	
 eN
KC  $H%x0s1$$$sq)#2.@88COOA&r*+++r%   c                    ddgddgddgg}t        dd|       }|j                  |      }t        j                  g d	g d
g dg      }t	        ||       ddgg}t        j                  g d	g      }| dk(  rd}nd}t        j                  t        |      5  |j                  |      }ddd       t	        ||       |j                  |      }t        |t        j                  ddggt                     y# 1 sw Y   OxY w)zZCheck drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
    during transform.rZ   r   r[   r   r   r   Fr   r   r.   r   r   )r   r   r   r  r   r/   tFound unknown categories in columns \[0, 1\] during transform. These unknown categories will be encoded as all zerosFound unknown categories in columns \[0, 1\] during transform. These unknown categories will be encoded as the infrequent category.r7   NrQ   r   r   r   r   r	   r:   rr  rs  r=   r   r
   r   r.   r   r   r   rV   rt  ru  r  s           r#   /test_ohe_drop_first_handle_unknown_ignore_warnsr    s    qC8c1X&A
E.C "G	
J GZ( AhZF9+&J! 	$ 	
 
k	2--' 
3GZ( !!*-Eubhhaz@A 
3	2   C77D c                    ddgddgddgg}t        dd|       }|j                  |      }t        j                  g d	g d
g dg      }t	        ||       ddgg}t        j                  g dg      }| dk(  rd}nd}t        j                  t        |      5  |j                  |      }ddd       t	        ||       |j                  |      }t        |t        j                  ddggt                     y# 1 sw Y   OxY w)zDCheck drop='if_binary' and handle_unknown='ignore' during transform.rZ   r   r[   r   r   r   Fr  r  r   r_   r  r   )r   r   r   r   r/   r  r  r7   NrQ   r  r  s           r#   3test_ohe_drop_if_binary_handle_unknown_ignore_warnsr    s    qC8c1X&A
nC "G	
J GZ( AhZF<.)J! 	$ 	
 
k	2--' 
3GZ( !!*-Eubhhd}FCD 
3	2r  c                 N   ddgddgddgg}t        dd| ddgddgg      }|j                  |       d	dgg}t        j                  ddgg      }| d
k(  rd}nd}t	        j
                  t        |      5  |j                  |      }ddd       t        |       y# 1 sw Y   xY w)znCheck drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
    during fit with categories passed in.rZ   r   r[   r   r   r   Fro  r  r/   zqFound unknown categories in columns \[0\] during transform. These unknown categories will be encoded as all zerosrp  r7   Nrq  )r.   r   r   rt  rV   ru  r   s          r#   'test_ohe_drop_first_explicit_categoriesr  ?  s    
 qC8c1X&A
%#JA'	C GGAJAhZFAq6(#J!E 	$ 	
 
k	2--' 
3GZ( 
3	2s   4BB$c                     t        j                  d      } | j                  g dg ddddg      }t        d	      }|j	                  d
       d}t        j
                  t        |      5  |j                  |       ddd       |j                  |       t        j
                  t        |      5  |j                  |       ddd       y# 1 sw Y   PxY w# 1 sw Y   yxY w)zJRaise informative error message when pandas output and sparse_output=True.rY   r  )r	  r[   r[   )rZ   r[   rZ   r[   r  Tr   r=   zxPandas output does not support sparse data. Set sparse_output=False to output pandas dataframes or disable Pandas outputr7   N)
r:   ra   rb   r   
set_outputr;   r<   r   r9   r=   )rc   r  r   r   s       r#   'test_ohe_more_informative_error_messager  a  s    			X	&B	IO<sCj	QB
d
+CNNXN&	S  
z	-" 
. GGBK	z	-b 
.	-	 
.	- 
.	-s   -C3CCC#c                  D   t        j                  t         j                  dddgg      j                  } t	        t         j
                        }dt         j
                   }t        j                  t        |      5  |j                  |        ddd       y# 1 sw Y   yxY w)zDTest ordinal encoder with nan passthrough fails when dtype=np.int32.r   r   rQ   zdThere are missing values in features \[0\]. For OrdinalEncoder to encode missing values with dtype: r7   N)
r   r   r   rU   r   r+   r:   r;   r<   r9   )r   r  r   s      r#   Btest_ordinal_encoder_passthrough_missing_values_float_errors_dtyper  u  su     	2663S)*+--A	bhh	'B	002z	;  
z	-
q	 
.	-	-s   ;BBencoded_missing_valuer=  c                    t        j                  t         j                  dddggt         j                        j                  }t        |       j                  |      }t        |j                        dk(  sJ t        |j                  d   ddt         j                  g       |j                  |      }t        || gdgdgdgg       |j                  |      }t        ||       y)	z.Test ordinal encoder with nan on float dtypes.r   r   rQ   r  r   r   r   N)r   r   r   r4  rU   r   r9   r  r   r	   r=   r   )r  r   r  r   r  s        r#   5test_ordinal_encoder_passthrough_missing_values_floatr"    s     	2663S)*"**=??A	.C	D	H	H	KBr~~!###BNN1%S"&&'9:ll1oGG45usecUKL$$W-IIq!r%   c           
         t        j                  d      }| dk(  r|j                  nt        j                  }|j                  d|j                  dd|ddgd      i      }t        |	      j                  |      }t        |j                        d
k(  sJ t        |j                  d   dd g d       t        j                  |j                  d   d         sJ |j                  |      }t        |dgdg|gdgdgg       |j                  |      }|j                   dk(  sJ t        |dddf   ddg       t        |dddf   ddg       t        j                  |d         sJ y)z0Check ordinal encoder is compatible with pandas.rY   r  r  r  rZ   r[   r   rQ   r!  r   r   Nr   r  rI          @r   r   )r   r   r   r   )r:   ra   r  r   r   rb   r   r   r9   r  r   r
   r	  r=   r	   r   r   )r  r  rc   r
  r  r  r  r  s           r#   =test_ordinal_encoder_missing_value_support_pandas_categoricalr%    s`    
		X	&B +w 6ruuBFF	BIIsC)93DJIW	

B 
.C	D	H	H	LBr~~!###r~~a(!,o>88BNN1%b)***||BHHuse.C-DsecUST$$X.I??f$$$y!Q'#s4yQ'#s488IdO$$$r%   r$  )zobject-None-missing-valuezobject-nan-missing_valueznumeric-missing-valuec                 v   t        |      }t        j                  dgt        j                  gg      }t	        |j                  |       |       |j                  d   j                  |k(  sJ t        |      }t        j                  t        d      5  |j                  |       ddd       y# 1 sw Y   yxY w)z.Test ordinal encoder for specified categories.r   r   r   r6   r7   N)r   r   r   r   r
   r   r   r*   r:   r;   r<   r9   )r   r@   r  r   r  r   s         r#   =test_ordinal_encoder_specified_categories_missing_passthroughr'    s    L 
4	(B
((SEBFF8$
%Cr''*C0 >>!""i/// 
4	(B	z)C	D
r
 
E	D	Ds   B//B8c                 $   t        j                  g dt              g} | |      }t        j                  ddggt              j                  }t	        j
                  t        d      5  |j                  |       ddd       y# 1 sw Y   yxY w)	zTest encoder for specified categories have duplicate values.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27088
    )rZ   r[   rZ   rQ   r   rZ   r[   z5the predefined categories contain duplicate elements.r7   N)r   r   r   rU   r:   r;   r<   r9   r  s       r#   +test_encoder_duplicate_specified_categoriesr)    sl     HH_F34D
T
"C
3*V,..A	Q
 	

 
 
s   +BBzX, expected_X_trans, X_testr   r   )r   r   r   )r   r$  r   r  )r  rZ   r[   )r$  r   r   c                     t        dd      }|j                  |       }t        ||       t        |j                  |      dgg       y)z>Test the interaction between missing values and handle_unknownr<  rI   r>  g      N)r   r   r	   r=   )r   expected_X_transrt  r  r   s        r#   /test_ordinal_encoder_handle_missing_and_unknownr,    sC    8 
':"	MBq!GG-.BLL(D6(3r%   csr_containerc                    t        j                  g dg dg      } | |      }t               }d}t        j                  t
        |      5  |j                  |       ddd       t        j                  t
        |      5  |j                  |       ddd       |j                  |      } | |      }t        j                  t
        |      5  |j                  |       ddd       y# 1 sw Y   xY w# 1 sw Y   dxY w# 1 sw Y   yxY w)zCheck that we raise proper error with sparse input in OrdinalEncoder.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19878
    r   r   z2Sparse data was passed, but dense data is requiredr7   N)	r   r   r   r:   r;   r   r9   r   r   )r-  r   X_sparseencoderr   r   r!   s          r#   test_ordinal_encoder_sparser1    s     	)Y'(AQHGBG	y	0H 
1	y	0h' 
1 ##A&G"7+N	y	0!!.1 
1	0 
1	0	0	0
 
1	0s$   C) C5D)C25C>D
c                  B   t        j                  g d      ddt         j                  f   } t        g dgdd      }|j	                  |        t        g dgd      }t        j                  t        d	
      5  |j	                  |        ddd       y# 1 sw Y   yxY w)zCheck OrdinalEncoder.fit works with unseen category when
    `handle_unknown="use_encoded_value"`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    )r   r   r   r   r   r   N)rI   r   r   r<  rJ  )rS   r.   r?  r4   r  r6   r7   )r   r   newaxisr   r9   r:   r;   r<   )r   r  s     r#   -test_ordinal_encoder_fit_with_unseen_categoryr4  5  sw     	#$Q

]3A	<0CSW
B FF1I	J<	HB	z)C	D
q	 
E	D	Ds   :BBr  AAOr  rt  c                     t        dd      }|j                  |        |j                  |      }t        |ddgg       y)zChecks that `OrdinalEncoder` transforms string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    r<  ir>  r   N)r   r9   r=   r	   )r  rt  r   r   s       r#   1test_ordinal_encoder_handle_unknown_string_dtypesr8  F  s;    * (;2
NCGGGmmF#GGr1gY'r%   c                  8   t        j                  g d      j                  dd      } t               j	                  |       }t        |j                  t        j                  | d      j                         |j                  |       }t        |dgdgdgdgg       y)	zCheck that `OrdinalEncoder` accepts Python integers that are potentially
    larger than 64 bits.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20721
    )l   	HP
1& l   	H]viel   	 :?i}Ga l   IRK2e6krI   r   r   )axisr   r   N)
r   r   rL   r   r9   r
   r   sortrU   r=   )r   r0  r   s      r#   #test_ordinal_encoder_python_integerr<  b  s     		
	 gb!n  ""1%Gw**BGGAA,>,@,@A"Gw!qcA3 45r%   c                      t        j                  d      } g d}| j                  g dg|      }t               j	                  |      }|j                         }t        ||       y)z-Check feature names out is same as the input.rY   )r[   r  rZ   r  r  N)r:   ra   rb   r   r9   r   r
   )rc   r  r   r   feature_names_outs        r#   .test_ordinal_encoder_features_names_out_pandasr?  v  sX    			X	&BE
i[%0A



q
!C113u/0r%   c                  &   t        j                  dgdgt         j                  ggt              } t	        dt         j                  d      j                  |       }|j                  |       }t        |dgdgdgg       t        j                  d	gt         j                  ggt              }|j                  |      }t        |t         j                  gdgg       |j                  |      }|d   d   J t        j                  |d   d         sJ y
)zECheck interactions between encode_unknown and missing value encoding.rZ   r[   rQ   r<  r.   r?  r  r   r   r  N)
r   r   r   r   r   r9   r=   r	   r   r	  )r   r  r   rt  r  X_roundtrips         r#   0test_ordinal_encoder_unknown_missing_interactionrD    s     	3%#)8A	*ff 
 
c!f	  ll1oGGqcA3-. XXurvvh'v6F<<'LLBFF8bT"23 &&|4K q>!$$$ 88KN1%&&&r%   with_pandasc                 t   t        j                  ddgddgdt         j                  ggt              }d}| r0t	        j
                  d      }|j                  |d	d
g      }|dz   }n|dz   }t        d      }t	        j                  t        |      5  |j                  |       ddd       y# 1 sw Y   yxY w)zWCheck OrdinalEncoder errors when encoded_missing_value is used by
    a known category.rZ   r  r[   r   r  rQ   zTencoded_missing_value \(1\) is already used to encode a known category in features: rY   letterpetr  z	\['pet'\]z\[1\]r   r!  r7   N)r   r   r   r   r:   ra   rb   r   r;   r<   r9   )rE  r   	error_msgrc   r  s        r#   0test_ordinal_encoder_encoded_missing_value_errorrJ    s     	3,esBFFm<FKA
	 
   *LLXu$5L6,	(		a	0B	z	3
q	 
4	3	3s   B..B7z4X_train, X_test_trans_expected, X_roundtrip_expected1c                    t        dt        j                  t        j                        j                  |       }t        j                  dgt        j                  gdgg      }|j                  |      }t        ||       |j                  |      }|j                  d   }t        |      D ]A  }||df   }	||df   }
|	|
J t        |	      rt        j                  |
      r9J |
|	k(  rAJ  y)znCheck transform when unknown_value and encoded_missing_value is nan.

    Non-regression test for #24082.
    r<  rB  rK  r[   r   N)r   r   r   r9   r   r=   r	   r   r   r`  r   r	  )r  X_test_trans_expectedX_roundtrip_expectedr  rt  r  rC  	n_samplesrb  expected_valvals              r#   9test_ordinal_encoder_unknown_missing_interaction_both_nanrR    s    4 
*ff ff
 
c'l	  XXurvvh./F<<'L L"78&&|4K$**1-I9+AqD1!Q$;;<(88C= =,&&& r%   c                  L   t        j                  d      } | j                  ddgddgd      }t               }|j	                  d       d}t        j
                  t        |	      5  |j                  |       d
d
d
       t        d      j	                  d      }t        d      j	                  d      }|j                  |      }|j                  |      }t        |j                         |       t        |j                         |j                         y
# 1 sw Y   xY w)z*Check OneHotEncoder works with set_output.rY   rZ   r[   r   r   r\   r  zCPandas output does not support sparse data. Set sparse_output=Falser7   NFr   default)r:   ra   rb   r   r  r;   r<   r   r	   to_numpyr
   r   r  )rc   rd   r   r8   ohe_default
ohe_pandas	X_defaultX_pandass           r#   test_one_hot_encoder_set_outputrZ    s    			X	&B<<sCj1v67D
/CNNXN&QE	z	/$ 
0  e4??)?TKU3>>>RJ))$/I''-HH%%'3z7798;K;KL 
0	/s   'DD#c                     t        j                  d      } | j                  ddgddgd      }t               j	                  d      }t               j	                  d      }|j                  |      }|j                  |      }t        |j                         |       t        |j                         |j                         y	)
z+Check OrdinalEncoder works with set_output.rY   rZ   r[   r   r   r\   rT  r  N)r:   ra   rb   r   r  r   r	   rU  r
   r   r  )rc   rd   ord_default
ord_pandasrX  rY  s         r#   test_ordinal_set_outputr^    s    			X	&B<<sCj1v67D "--	-BK!,,x,@J))$/I''-HH%%'3z7798;K;KLr%   c                     g dddgg} t        |       }|j                  ddgg       t        |       t        |j                        k(  sJ t	        |j                        D ])  \  }}|j
                  t        k(  sJ t        | |   |       + y)zjCheck that the categories_ dtype is `object` for string categories

    Regression test for gh-25171.
    )asmmaseasrasacsrK  2r   r`  N)r   r9   r  r   	enumerater*   r   r
   )rS   r   nr   s       r#    test_predefined_categories_dtyperh    s    
 6SzBJ
:
.CGGdC[Mz?c#//2222COO,3yyF""":a=#. -r%   c                  `   t        j                  dgdgt         j                  ggt              } t	        d      j                  |       }t        |dgdgdgg       t	        dd	      j                  |       }t        j                  d
gg      }|j                  |      }t        |dgg       y)zBCheck missing value or unknown encoding can equal the cardinality.r  r   rQ   r   r!  r   r   r<  r>  snakeN)	r   r   r   r   r   r   r	   r9   r=   )r   r   r   rt  s       r#   1test_ordinal_encoder_missing_unknown_encoding_maxrk  &  s    
5'E7RVVH-V<A15CCAFGGqcA3_-
(;1
M
Q
QRS
TCXXyk"FmmF#GGqcU#r%   c                  H   t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   dgdz  z   gt              j                  } t	        dd	d
      j                  |       }t        |j                         g d       |j                  d   |j                  d      dk(  sJ t        j                  dgdz  dgdz  z   dgdz  z   gt              j                  } t	        dd	d      j                  |       }t        |j                         dg       |j                  d   |j                  d      dk(  sJ t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   dgdz  z   gt              j                  } t	        dd	dg      j                  |       }t        |j                         g d       |j                  d   |j                  d      dk(  sJ t	        dd	d      j                  |       }t        |j                         g d       |j                  J y)zkCheck drop_idx is defined correctly with infrequent categories.

    Non-regression test for gh-25550.
    rZ   r   r[   r3   r  r  r  rQ   Fr   )r  r   r   )r  x0_dx0_er  r   rk   r   r  )r  r  rn  r  N)r  r  rm  rn  r  )
r   r   r   rU   r   r9   r
   r   r   r,  )r   r   s     r#   #test_drop_idx_infrequent_categoriesro  2  s&   
 	
cUQY	#	*cUQY	6#	BC6	a  au7
K
O
OPQ
RC!!#%V ??1cmmA./3666
3%!)seai'3%"*45VDFFA
au;
O
S
STU
VCs0025L4MN??1cmmA./3666

cUQY	#	*cUQY	6#	BC6	a  auC5
I
M
Ma
PC!!#%V ??1cmmA./3666
au4
H
L
LQ
OC!!#A ==   r%   c                    t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   g      j                  }t        dd	d
d| j	                  |      }t        |j                  g dg       t        |j                  ddgg       dgdgdgdgdgg}dgdgdgdgd
gg}|j                  |      }t        ||       |j                  |      }dgdgdgdgdgg}t        ||       y)zGTest parameters for grouping 'a', and 'd' into the infrequent category.rZ   r   r[   r(  r  rk   r  r   r<  rI   r>  r  r	  r   r   r   r  Nr   )r   r   rU   r   r9   r
   r   r  r=   r	   r   )r  r  ordinalrt  expected_transr   r  expected_inverses           r#   ,test_ordinal_encoder_infrequent_three_levelsrt  V  s$    hh	SEBJ.#;seaiGHIKKG *"@F	c'l  w**-A,BCw55c
|DecUSEC53%0FcA3aS2$/N'GG^,))'2I					 y"23r%   c                     t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   gt        	      j                  } t	        g d
gddd      j                  |       }t        |j                  g d
g       t        |j                  ddgg       dgdgdgdgdgg}dgdgdgdgdgg}|j                  |      }t        ||       |j                  |      }dgdgdgdgdgg}t        ||       y)zTest that the order of the categories provided by a user is respected.

    In this case 'c' is encoded as the first category and 'b' is encoded
    as the second one.
    rZ   r   r[   r(  r  rk   r  r   rQ   r  r<  rI   )rS   r  r.   r?  r	  r   r   r   r  N)r   r   r   rU   r   r9   r
   r   r  r=   r	   r   )r  rq  rt  rr  r   r  rs  s          r#   6test_ordinal_encoder_infrequent_three_levels_user_catsrv  }  s,    hh
cURZ	3%"*	,uqy	89a  ()*	
 
c'l  w**-A,BCw55c
|DecUSEC53%0FcA3aS2$/N'GG^,))'2I					 y"23r%   c                     t        j                  g dg df      } t        d      j                  |       }t	        |j
                  d   ddg       |j
                  d   J ddgddgg}ddgddgg}|j                  |      }t        ||       |j                  |      }t        j                  ddgd	dggt        
      }t	        ||       y)zETest when feature 0 has infrequent categories and feature 1 does not.r  r  r   r  r   r   r   Nr  rQ   )r   column_stackr   r9   r
   r  r=   r	   r   r   r   )r   rq  rt  rr  r   r  rs  s          r#   %test_ordinal_encoder_infrequent_mixedrz    s     	46QRSAA.2215Gw55a81a&A))!,444!fq!fF!fq!f%N'GG^,))'2Ixx!Q*>)B C6Ry"23r%   c            	      z   t        j                  d      } | j                  g d      }| j                  g dg d| j	                  dgdz  dgdz  z   d	gz   d
gz   |      dg d      }t        d      j                  |      }t        |j                  d   ddg       t        |j                  d   g d       t        |j                  d   d
d	g       | j                  g dg d| j	                  dgd	gz   d
gz   dgz   |      dg d      }g dg dg dg dg}|j                  |      }t        ||       y)zHTest infrequent categories with a pandas DataFrame with multiple dtypes.rY   )birdr   r  rj  r  r  r  r3   r   r   rj  r|  rQ   )r  rS  r  r  rx  r   rZ   r[   r   r  r   )rZ   r[   r  r  )rn   r   rk   r   )r   r   r   )r   r   r   )r   r   r   r  N)r:   ra   CategoricalDtyperb   r   r   r9   r
   r  r=   r	   )rc   categorical_dtyper   rq  rt  rr  r   s          r#   :test_ordinal_encoder_infrequent_multiple_categories_dtypesr    s[    
		X	&B++,KL
@199!ugk)WI5@' % 	
 . 	 
	A A.2215G w55a83*Ew55a8*Ew55a867:KL\\'!997)#vh.%8' % 	
 .  
F  IyAN'GG^,r%   c                     t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   t         j                  gz   gt        	      j                  } t        d
ddd      j                  |       }t        |j                  g dg       t        j                  dgdgdgdgdgt         j                  ggt        	      }dgdgdgdgdgdgg}|j                  |      }t        ||       y)zJCheck behavior of unknown_value and encoded_missing_value with infrequent.rZ   r   r[   r(  r  rk   r  r   rQ   r<  r   )r.   r?  r  r  r  r  r   r   N)r   r   r   r   rU   r   r9   r
   r  r=   r	   )r  rq  rt  rr  r   s        r#   .test_ordinal_encoder_infrequent_custom_mappingr    s    hh
cURZ	3%"*	,uqy	8BFF8	CDFa  *	
 
c'l  w557HIXXusecUSEC5266(C6RFcA3aS1#s3N'GG^,r%   c                 d   t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   gt        	      j                  }t	        di | d
ddj                  |      }t	        d
d      j                  |      }dgdgdgdgdgg}t        |j                  |      |j                  |             y)zMAll categories are considered frequent have same encoding as default encoder.rZ   r   r[   r(  r  rk   r  r   rQ   r<  rI   r>  r  Nr   r   r   r   rU   r   r9   r	   r=   )r  r  adjusted_encoderdefault_encoderrt  s        r#   !test_ordinal_encoder_all_frequentr    s     hh
cURZ	3%"*	,uqy	89a  & 
!4B	c'l  %*"	c'l  ecUSEC53%0F""6*O,E,Ef,Mr%   d   c                 "   t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   gt        	      j                  }t	        di | d
ddj                  |      }dgdgdgdgdgg}t        |j                  |      dgdgdgdgdgg       y)zAWhen all categories are infrequent, they are all encoded as zero.rZ   r   r[   r(  r  rk   r  r   rQ   r<  rI   r>  r  r   Nr   r  )r  r  r0  rt  s       r#   #test_ordinal_encoder_all_infrequentr  	  s     hh
cURZ	3%"*	,uqy	89a   
!4B	c'l  ecUSEC53%0FG%%f-aS1#sRD/IJr%   c                     t        j                  t         j                  gdz  dgdz  z   dgdz  z   dgz   dgz   gt              j                  } t        d	
      j                  |       }t        j                  dddt         j                  ggt              j                  }|j                  |      }t        |dgdgdgt         j                  gg       y)z5Check behavior when missing value appears frequently.r(  r  rk   r   r   rj  deerrQ   r   rx  r   r   r   N	r   r   r   r   rU   r   r9   r=   r	   r   rq  rt  r   s       r#   -test_ordinal_encoder_missing_appears_frequentr  *	  s    

&&B%2	%!	3wi	?6(	JK	 a  A.2215GXXrvv67vFHHF'GGqcA3bffX67r%   c            	         t        j                  t         j                  gdgdz  z   dgdz  z   dgz   dgz   dgdz  d	gdz  z   gt        
      j                  } t        d      j                  |       }t        j                  ddgdd	gt         j                  d	gdd	gddggt        
      }|j                  |      }t        |ddgddgt         j                  dgddgddgg       y)z7Check behavior when missing value appears infrequently.r  rk   r   r   rj  r  redrM  greenrQ   r3   )r  r   r   r   Nr  r  s       r#   /test_ordinal_encoder_missing_appears_infrequentr  7	  s    
 	VVHw|#ugk1WI=HGaK7)a-'	
 	 a  1-11!4GXXeWVVWGEN	
 	F 'GGq!fq!frvvqkAq6Aq6JKr%   c                     t        j                  dgdgdggt              } | g dg      }t        j                  t
              5  |j                  |       ddd       y# 1 sw Y   yxY w)a!  Check that we raise a `NotFittedError` by calling transform before fit with
    the encoders.

    One could expect that the passing the `categories` argument to the encoder
    would make it stateless. However, `fit` is making a couple of check, such as the
    position of `np.nan`.
    r]   r^   rf  rQ   re  r   N)r   r   r   r:   r;   r   r=   )r  r   r0  s      r#   test_encoder_not_fittedr  S	  sT     	3%#&f5A/!23G	~	&! 
'	&	&r   c                  j   t        j                  dgdz  dgdz  z   dgz         j                  dd      x} } t        j                  g d      j                  dd      }t        dd	d
d      }|j	                  |        t        dd	d
d      }|j	                  |        d}t        j                  t        |      5  |j                  |      }ddd       t        j                  t        |      5  |j                  |      }ddd       t        d
   d
          y# 1 sw Y   QxY w# 1 sw Y   (xY w)zo
    Check handle_unknown='warn' behave like 'infrequent_if_exist' and map
    to the infrequent category.
    
restaurantr   shopsnackrI   r   )r  r  casinor1   Fr   r   )r.   r   r  r   r0   z=unknown categories will be encoded as the infrequent categoryr7   N)
r   r   rL   r   r9   r:   rr  rs  r=   r	   )
train_data	test_dataencoder_warnencoder_infreqwarning_matchresult_infreqresult_warns          r#   9test_onehotencoder_handle_unknown_warn_maps_to_infrequentr  b	  s    !hh	fX\)WI5gb!nJ :;CCBJI U!'L Z ",	N z"SM	k	7&00; 
8 
k	7",,Y7 
8 KNM!$45 
8	7 
8	7s   ;D0D)D&)D2)r   rj  numpyr   r:   scipyr   sklearn.exceptionsr   sklearn.preprocessingr   r   sklearn.utils._missingr   sklearn.utils._testingr   r	   r
   sklearn.utils.fixesr   r$   r-   markparametrizerC   rM   r+   float32r4  rW   re   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  ra  str_r  rU   r  r  r  r  r  r  r%  r0  r6  r8  r:  rH  rS  rO  rQ  rT  rY  rc  rh  rm  rv  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r"  r%  r'  r)  r,  r1  r4  r8  r<  r?  rD  rJ  rT   rR  rZ  r^  rh  rk  ro  rt  rv  rz  r  r  r  r  r  r  r  r  r   r%   r#   <module>r     s   	     - ? 0 
 /@.3 )+TU# V#. )+TU& V&$ "((BJJ

)KL2::rzz(JK
; L M
; "((BJJ

)KLA MA92xJ$4	9
 	(+z*+#%67vF/C#78G/Cuu#=>fM"O4FC/C#67vF/Cut#<=VL			  .G/.G )+TUUDM2$1,$ 2 3 V,$^ UDM2b'Ar7QG	$y)Y&GHS\E3<'3%Fo?	
	'	 3'*7 !=>'CD
C E ?
C E?#;<1vxrxxc
';<= > = E?#;< = "+r{	#uenrd%;RZZH	Aq6Aq6"	#q!fqc]BJJ?BHHsElS%L1@3Z%!JJ	

 
C<#u.	/3*ug1FP	Aq6BFFA;'	(Arvv;*<bjjIBHHsBFFmdBFF^4FC4[266(#JJ	
 BHHsE%L)D%,+?@O4[5<.)JJ	
*	/   B7C B7" )+TU BHHsCj\022BHHsCj\022JJ		
 BHHq!fXW-//BHHq!fXW-//KHH		
 BHHsCj\022BHHsCj\022RXXo&'JJ		
 BHHtSk]&133BHHtSk]&133		
 BHHsCj\022BHHsBFFm_F355		
 BHHsDk]&133BHHsBFFm_F355		
?%L	Q  0bAc0 VdA($ ]N$CD
 E

?7 	66"#	./	()
 	&  66&0 	(+{+,#%67vF
 	'  22  BHHsCj\022BHHsCj\022JJ		
 BHHq!fXW-//BHHq!fXW-//KHH		
 BHHsCj\022BHHsCj\022RXXo&'JJ		
( 	3-  010"$- 5#,/- 0-6	868, +w!78) 9)4 2664u*FG!1 H!1H 5!*.A!BCB DB T5M'7JK'=!9?RS. T L.$ ]N$CD? E? 	1	"	$q1r2	 1E0F'GHI I	I6 +w!>?C @C. 3%#0 1" 	1	!	!	$	$q1q1QQ< 'C5!12/ 3/. 3%#0 10 !a8?A:NO;;2,6!,H;$X,v>,B bA$N#OP
$ Q
$ a1$M#NO) P)  !#M 'BC* D*6* 2664.9G :G,. )+TU((;<, = V,< )+TU)B V)BX )+TU(E V(EV )+TU) V)B( 02662,?" @"" ((;<02662,?% @ =%>  3-7993*V4663RVV,F;<

	 3-7993*V4663RVV,F;<

	 3-

;==3%

3553RVV,-.

	%4	9  !DE!D$ ]N$CD E ! BHHsBFFC()*,,BHHsBFFC()*,,BHHseW	
 BHHo&'))BHHo&'))BHHrvvhZ 	
 BHHsBFFC()8::BHHsBFFC()*,,BHHseWF+	
 BHHo&f577BHHo&'))BHHrvvhZv.	
!24324 .92 :2," 
4+c*4+c* 
s3*S)3*S)	( 	(6(	1'< u6 72 :
 BHHsecU^62S266(RVVH%BJJvv.f=	
 BHHrvvhu-V<S266(RVVH%BJJx"&&2&A	
&''&'BM.M /"	$!!H 	1	!	!	$	$q1q1446!4H4*--`-* 	1	!( 	1	#
K
K
8L8 ]N$CD E!6r%   