
    P3j                     >   d Z ddlZddlZddlmZ ddlmZmZm	Z	m
Z
mZmZmZmZmZmZmZ ddlmZmZmZmZmZmZ ddlmZ ddlmZ ddlmZmZm Z  dd	l!m"Z" dd
l#m$Z$  G d d      Z% G d de%      Z& G d de%      Z' G d de%      Z( G d de%      Z) G d de%      Z* G d de%      Z+ G d de%      Z, G d de%      Z- G d de%      Z. G d d e%      Z/ G d! d"e%      Z0e&e'e(e)e*e+e,e.e/e0d#
Z1 G d$ d%      Z2d& Z3 G d' d(e2e.      Z4 G d) d*e2e/      Z5 G d+ d,e2e*      Z6y)-z
This module contains loss classes suitable for fitting.

It is not part of the public API.
Specific losses are used for regression, binary classification or multiclass
classification.
    Nxlogy)CyAbsoluteErrorCyExponentialLossCyHalfBinomialLossCyHalfGammaLossCyHalfMultinomialLossCyHalfPoissonLossCyHalfSquaredErrorCyHalfTweedieLossCyHalfTweedieLossIdentityCyHuberLossCyPinballLoss)HalfLogitLinkIdentityLinkInterval	LogitLinkLogLinkMultinomialLogit)one_hot)check_scalar)_average
_logsumexp_ravel)softmax)_weighted_percentilec                       e Zd ZdZdZdZddZd Zd Z	 	 	 ddZ		 	 	 	 dd	Z
	 	 	 dd
Z	 	 	 	 ddZddZddZddZej"                  dfdZy)BaseLossa  Base class for a loss function of 1-dimensional targets.

    Conventions:

        - y_true.shape = sample_weight.shape = (n_samples,)
        - y_pred.shape = raw_prediction.shape = (n_samples,)
        - If is_multiclass is true (multiclass classification), then
          y_pred.shape = raw_prediction.shape = (n_samples, n_classes)
          Note that this corresponds to the return value of decision_function.

    y_true, y_pred, sample_weight and raw_prediction must either be all float64
    or all float32.
    gradient and hessian must be either both float64 or both float32.

    Note that y_pred = link.inverse(raw_prediction).

    Specific loss classes can inherit specific link classes to satisfy
    BaseLink's abstractmethods.

    Parameters
    ----------
    closs: CyLossFunction
        For example, a CyLossFunction; hence the name "c"loss.
    link : BaseLink
    sample_weight : {None, ndarray}
        If sample_weight is None, the hessian might be constant.
    n_classes : {None, int}
        The number of classes for classification, else None.
    xp : module, default=None
        Array namespace module.
    device : device, default=None
        A device object (see the "Device Support" section of the array API spec).

    Attributes
    ----------
    closs: CyLossFunction
        For example, a CyLossFunction; hence the name "c"loss.
    link : BaseLink
    n_classes : {None, int}
        The number of classes for classification, else None.
    xp : module or None
        Array namespace module. Ignored by the Cython implementation.
    device : device or None
        A device object. Ignored by the Cython implementation.
    interval_y_true : Interval
        Valid interval for y_true
    interval_y_pred : Interval
        Valid Interval for y_pred
    differentiable : bool
        Indicates whether or not loss function is differentiable in
        raw_prediction everywhere.
    approx_hessian : bool
        Indicates whether the hessian is approximated or exact. If,
        approximated, it should be larger or equal to the exact one.
    constant_hessian : bool
        Indicates whether the hessian is one for this loss.
    is_multiclass : bool
        Indicates whether n_classes > 2 is allowed.
    TFNc                     || _         || _        || _        || _        || _        d| _        d| _        t        t        j                   t        j                  dd      | _
        | j                  j                  | _        y )NF)closslink	n_classesxpdeviceapprox_hessianconstant_hessianr   npinfinterval_y_trueinterval_y_pred)selfr    r!   r"   r#   r$   s         ?/DATA/.local/lib/python3.12/site-packages/sklearn/_loss/loss.py__init__zBaseLoss.__init__   sd    
	"# %'F#yy88    c                 8    | j                   j                  |      S zuReturn True if y is in the valid range of y_true.

        Parameters
        ----------
        y : ndarray
        )r)   includesr+   ys     r,   in_y_true_rangezBaseLoss.in_y_true_range        ##,,Q//r.   c                 8    | j                   j                  |      S )zuReturn True if y is in the valid range of y_pred.

        Parameters
        ----------
        y : ndarray
        )r*   r1   r2   s     r,   in_y_pred_rangezBaseLoss.in_y_pred_range   r5   r.   c                     |t        j                  |      }|j                  dk(  r#|j                  d   dk(  r|j	                  d      }| j
                  j                  |||||       |S )aJ  Compute the pointwise loss value for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        loss_out : None or C-contiguous array of shape (n_samples,)
            A location into which the result is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        loss : array of shape (n_samples,)
            Element-wise loss function.
              y_trueraw_predictionsample_weightloss_out	n_threads)r'   
empty_likendimshapesqueezer    lossr+   r<   r=   r>   r?   r@   s         r,   rE   zBaseLoss.loss   ss    < }}V,H!#(<(<Q(?1(D+33A6N

)' 	 	
 r.   c                    |O|+t        j                  |      }t        j                  |      }nEt        j                  ||j                        }n#|!t        j                  ||j                        }|j                  dk(  r#|j                  d   dk(  r|j                  d      }|j                  dk(  r#|j                  d   dk(  r|j                  d      }| j                  j                  ||||||       ||fS )a  Compute loss and gradient w.r.t. raw_prediction for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        loss_out : None or C-contiguous array of shape (n_samples,)
            A location into which the loss is stored. If None, a new array
            might be created.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array             of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        loss : array of shape (n_samples,)
            Element-wise loss function.

        gradient : array of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.
        dtyper9   r:   )r<   r=   r>   r?   gradient_outr@   )r'   rA   rI   rB   rC   rD   r    loss_gradient)r+   r<   r=   r>   r?   rJ   r@   s          r,   rK   zBaseLoss.loss_gradient   s    L #==0!}}^<==|7I7IJ!==x~~NL !#(<(<Q(?1(D+33A6N!l&8&8&;q&@'//2L

  )'% 	! 	
 %%r.   c                 <   |t        j                  |      }|j                  dk(  r#|j                  d   dk(  r|j	                  d      }|j                  dk(  r#|j                  d   dk(  r|j	                  d      }| j
                  j                  |||||       |S )a  Compute gradient of loss w.r.t raw_prediction for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array             of shape (n_samples, n_classes)
            A location into which the result is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : array of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.
        r9   r:   )r<   r=   r>   rJ   r@   )r'   rA   rB   rC   rD   r    gradientr+   r<   r=   r>   rJ   r@   s         r,   rM   zBaseLoss.gradient  s    > ==8L !#(<(<Q(?1(D+33A6N!l&8&8&;q&@'//2L

)'% 	 	
 r.   c                 0   |C|+t        j                  |      }t        j                  |      }n-t        j                  |      }n|t        j                  |      }|j                  dk(  r#|j                  d   dk(  r|j	                  d      }|j                  dk(  r#|j                  d   dk(  r|j	                  d      }|j                  dk(  r#|j                  d   dk(  r|j	                  d      }| j
                  j                  ||||||       ||fS )a  Compute gradient and hessian of loss w.r.t raw_prediction.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array             of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        hessian_out : None or C-contiguous array of shape (n_samples,) or array             of shape (n_samples, n_classes)
            A location into which the hessian is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : arrays of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.

        hessian : arrays of shape (n_samples,) or (n_samples, n_classes)
            Element-wise hessians.
        r9   r:   )r<   r=   r>   rJ   hessian_outr@   )r'   rA   rB   rC   rD   r    gradient_hessian)r+   r<   r=   r>   rJ   rP   r@   s          r,   rQ   zBaseLoss.gradient_hessianP  s   N "!}}^< mmN;!}}[9 --5K !#(<(<Q(?1(D+33A6N!l&8&8&;q&@'//2Lq [%6%6q%9Q%>%--a0K

##)'%# 	$ 	
 [((r.   c           	      X    t        j                  | j                  ||dd|      |      S )a{  Compute the weighted average loss.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        loss : float
            Mean or averaged loss function.
        Nr;   weights)r'   averagerE   )r+   r<   r=   r>   r@   s        r,   __call__zBaseLoss.__call__  s:    ( zzII-"#   "	
 		
r.   c                    t        j                  ||d      }dt        j                  |j                        j                  z  }| j
                  j                  t         j                   k(  rd}nF| j
                  j                  r| j
                  j                  }n| j
                  j                  |z   }| j
                  j                  t         j                  k(  rd}nF| j
                  j                  r| j
                  j                  }n| j
                  j                  |z
  }||| j                  j                  |      S | j                  j                  t        j                  |||            S )a#  Compute raw_prediction of an intercept-only model.

        This can be used as initial estimates of predictions, i.e. before the
        first iteration in fit.

        Parameters
        ----------
        y_true : array-like of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or array of shape (n_samples,)
            Sample weights.

        Returns
        -------
        raw_prediction : numpy scalar or array of shape (n_classes,)
            Raw predictions of an intercept-only model.
        r   rT   axis
   N)r'   rU   finforI   epsr*   lowr(   low_inclusivehighhigh_inclusiver!   clip)r+   r<   r>   y_predr\   a_mina_maxs          r,   fit_intercept_onlyzBaseLoss.fit_intercept_only  s   ( FMB288FLL)---##w.E!!//((,,E((,,s2E$$.E!!00((--E((--3E=U]99>>&))99>>"''&%"?@@r.   c                 ,    t        j                  |      S )a(  Calculate term dropped in loss.

        With this term added, the loss of perfect predictions is zero.

        Parameters
        ----------
        y_true : array-like of shape (n_samples,)
            Observed, true target values.

        sample_weight : None or array of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        constant : ndarray of shape (n_samples,)
            Constant value to be added to raw predictions so that the loss
            of perfect predictions becomes zero.
        )r'   
zeros_liker+   r<   r>   s      r,   constant_to_optimal_zeroz!BaseLoss.constant_to_optimal_zero  s    & }}V$$r.   Fc                 V   |t         j                  t         j                  fvrt        d| d      | j                  r|| j
                  f}n|f}t        j                  |||      }| j                  rt        j                  d|      }||fS t        j                  |||      }||fS )au  Initialize arrays for gradients and hessians.

        Unless hessians are constant, arrays are initialized with undefined values.

        Parameters
        ----------
        n_samples : int
            The number of samples, usually passed to `fit()`.
        dtype : {np.float64, np.float32}, default=np.float64
            The dtype of the arrays gradient and hessian.
        order : {'C', 'F'}, default='F'
            Order of the arrays gradient and hessian. The default 'F' makes the arrays
            contiguous along samples.

        Returns
        -------
        gradient : C-contiguous array of shape (n_samples,) or array of shape             (n_samples, n_classes)
            Empty array (allocated but not initialized) to be used as argument
            gradient_out.
        hessian : C-contiguous array of shape (n_samples,), array of shape
            (n_samples, n_classes) or shape (1,)
            Empty (allocated but not initialized) array to be used as argument
            hessian_out.
            If constant_hessian is True (e.g. `HalfSquaredError`), the array is
            initialized to ``1``.
        zCValid options for 'dtype' are np.float32 and np.float64. Got dtype=z	 instead.)rC   rI   order)r:   )rC   rI   )	r'   float32float64
ValueErroris_multiclassr"   emptyr&   ones)r+   	n_samplesrI   rl   rC   rM   hessians          r,   init_gradient_and_hessianz"BaseLoss.init_gradient_and_hessian  s    8 RZZ00"G9. 
 /ELE88%uEB  
 ggD6G    hhU%uEG  r.   NNNNNr:   NNNr:   Nr:   N)__name__
__module____qualname____doc__differentiablerp   r-   r4   r7   rE   rK   rM   rQ   rV   re   ri   r'   rn   ru    r.   r,   r   r   M   s    :J NM	900 +b =&F /j @)D
>(AT%* :<3 1!r.   r   c                   $     e Zd ZdZd fd	Z xZS )HalfSquaredErrora  Half squared error with identity link, for regression.

    Domain:
    y_true and y_pred all real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, half squared error is defined as::

        loss(x_i) = 0.5 * (y_true_i - raw_prediction_i)**2

    The factor of 0.5 simplifies the computation of gradients and results in a
    unit hessian (and is consistent with what is done in LightGBM). It is also
    half the Normal distribution deviance.
    c                 ^    t         |   t               t               ||       |d u | _        y )Nr    r!   r#   r$   )superr-   r   r   r&   r+   r>   r#   r$   	__class__s       r,   r-   zHalfSquaredError.__init__6  s2    $&\^6 	 	
 !. 5r.   rv   r{   r|   r}   r~   r-   __classcell__r   s   @r,   r   r   $  s    "6 6r.   r   c                   0     e Zd ZdZdZd fd	ZddZ xZS )AbsoluteErrora  Absolute error with identity link, for regression.

    Domain:
    y_true and y_pred all real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, the absolute error is defined as::

        loss(x_i) = |y_true_i - raw_prediction_i|

    Note that the exact hessian = 0 almost everywhere (except at one point, therefore
    differentiable = False). Optimization routines like in HGBT, however, need a
    hessian > 0. Therefore, we assign 1.
    Fc                 l    t         |   t               t               ||       d| _        |d u | _        y )Nr   T)r   r-   r   r   r%   r&   r   s       r,   r-   zAbsoluteError.__init__Q  s:    !#,.R 	 	
 # - 5r.   c                 N    |t        j                  |d      S t        ||d      S )Compute raw_prediction of an intercept-only model.

        This is the weighted median of the target, i.e. over the samples
        axis=0.
        r   rY   2   )r'   medianr   rh   s      r,   re   z AbsoluteError.fit_intercept_onlyX  s*      99V!,,'rBBr.   rv   rz   r{   r|   r}   r~   r   r-   re   r   r   s   @r,   r   r   =  s    " N6	Cr.   r   c                   0     e Zd ZdZdZd fd	ZddZ xZS )PinballLossa  Quantile loss aka pinball loss, for regression.

    Domain:
    y_true and y_pred all real numbers
    quantile in (0, 1)

    Link:
    y_pred = raw_prediction

    For a given sample x_i, the pinball loss is defined as::

        loss(x_i) = rho_{quantile}(y_true_i - raw_prediction_i)

        rho_{quantile}(u) = u * (quantile - 1_{u<0})
                          = -u *(1 - quantile)  if u < 0
                             u * quantile       if u >= 0

    Note: 2 * PinballLoss(quantile=0.5) equals AbsoluteError().

    Note that the exact hessian = 0 almost everywhere (except at one point, therefore
    differentiable = False). Optimization routines like in HGBT, however, need a
    hessian > 0. Therefore, we assign 1.

    Additional Attributes
    ---------------------
    quantile : float
        The quantile level of the quantile to be estimated. Must be in range (0, 1).
    Fc                     t        |dt        j                  ddd       t        |   t        t        |            t               ||       d| _        |d u | _	        y )	Nquantiler   r:   neithertarget_typemin_valmax_valinclude_boundaries)r   r   T)
r   numbersRealr   r-   r   floatr   r%   r&   )r+   r>   r   r#   r$   r   s        r,   r-   zPinballLoss.__init__  sc    (	
 	x9	 	 	
 # - 5r.   c                     |/t        j                  |d| j                  j                  z  d      S t	        ||d| j                  j                  z        S )r   d   r   r   )r'   
percentiler    r   r   rh   s      r,   re   zPinballLoss.fit_intercept_only  sO      ==tzz/B/B)BKK'sTZZ-@-@'@ r.   )N      ?NNrz   r   r   s   @r,   r   r   d  s    : N6$r.   r   c                   2     e Zd ZdZdZ	 d fd	ZddZ xZS )	HuberLossa  Huber loss, for regression.

    Domain:
    y_true and y_pred all real numbers
    quantile in (0, 1)

    Link:
    y_pred = raw_prediction

    For a given sample x_i, the Huber loss is defined as::

        loss(x_i) = 1/2 * abserr**2            if abserr <= delta
                    delta * (abserr - delta/2) if abserr > delta

        abserr = |y_true_i - raw_prediction_i|
        delta = quantile(abserr, self.quantile)

    Note: HuberLoss(quantile=1) equals HalfSquaredError and HuberLoss(quantile=0)
    equals delta * (AbsoluteError() - delta/2).

    Additional Attributes
    ---------------------
    quantile : float
        The quantile level which defines the breaking point `delta` to distinguish
        between absolute error and squared error. Must be in range (0, 1).

     Reference
    ---------
    .. [1] Friedman, J.H. (2001). :doi:`Greedy function approximation: A gradient
      boosting machine <10.1214/aos/1013203451>`.
      Annals of Statistics, 29, 1189-1232.
    Fc                     t        |dt        j                  ddd       || _        t        |   t        t        |            t               ||       d| _	        d	| _
        y )
Nr   r   r:   r   r   )deltar   TF)r   r   r   r   r   r-   r   r   r   r%   r&   )r+   r>   r   r   r#   r$   r   s         r,   r-   zHuberLoss.__init__  sg     	(	
 !E%L1	 	 	
 # %r.   c                 6   |t        j                  |dd      }nt        ||d      }||z
  }t        j                  |      t        j                  | j
                  j                  t        j                  |            z  }|t        j                  ||      z   S )r   r   r   r   rS   )	r'   r   r   signminimumr    r   absrU   )r+   r<   r>   r   diffterms         r,   re   zHuberLoss.fit_intercept_only  sx      ]]62A6F)&-DFwwt}rzz$***:*:BFF4LII

4???r.   )Ng?r   NNrz   r   r   s   @r,   r   r     s"    B N LP&*@r.   r   c                   ,     e Zd ZdZd fd	ZddZ xZS )HalfPoissonLossa  Half Poisson deviance loss with log-link, for regression.

    Domain:
    y_true in non-negative real numbers
    y_pred in positive real numbers

    Link:
    y_pred = exp(raw_prediction)

    For a given sample x_i, half the Poisson deviance is defined as::

        loss(x_i) = y_true_i * log(y_true_i/exp(raw_prediction_i))
                    - y_true_i + exp(raw_prediction_i)

    Half the Poisson deviance is actually the negative log-likelihood up to
    constant terms (not involving raw_prediction) and simplifies the
    computation of the gradients.
    We also skip the constant term `y_true_i * log(y_true_i) - y_true_i`.
    c                     t         |   t               t               ||       t	        dt
        j                  dd      | _        y )Nr   r   TF)r   r-   r
   r   r   r'   r(   r)   r   s       r,   r-   zHalfPoissonLoss.__init__  s<    #%GI"V 	 	
  (2664?r.   c                 2    t        ||      |z
  }|||z  }|S rz   r   r+   r<   r>   r   s       r,   ri   z(HalfPoissonLoss.constant_to_optimal_zero  s(    VV$v-$M!Dr.   rv   rz   r{   r|   r}   r~   r-   ri   r   r   s   @r,   r   r     s    (@r.   r   c                   ,     e Zd ZdZd fd	ZddZ xZS )HalfGammaLossaV  Half Gamma deviance loss with log-link, for regression.

    Domain:
    y_true and y_pred in positive real numbers

    Link:
    y_pred = exp(raw_prediction)

    For a given sample x_i, half Gamma deviance loss is defined as::

        loss(x_i) = log(exp(raw_prediction_i)/y_true_i)
                    + y_true/exp(raw_prediction_i) - 1

    Half the Gamma deviance is actually proportional to the negative log-
    likelihood up to constant terms (not involving raw_prediction) and
    simplifies the computation of the gradients.
    We also skip the constant term `-log(y_true_i) - 1`.
    c                     t         |   t               t               ||       t	        dt
        j                  dd      | _        y )Nr   r   F)r   r-   r   r   r   r'   r(   r)   r   s       r,   r-   zHalfGammaLoss.__init__&  s6    0wyRPVW'2665%@r.   c                 F    t        j                  |       dz
  }|||z  }|S ry   )r'   logr   s       r,   ri   z&HalfGammaLoss.constant_to_optimal_zero*  s+    v"$M!Dr.   rv   rz   r   r   s   @r,   r   r     s    &Ar.   r   c                   ,     e Zd ZdZd fd	ZddZ xZS )HalfTweedieLossa  Half Tweedie deviance loss with log-link, for regression.

    Domain:
    y_true in real numbers for power <= 0
    y_true in non-negative real numbers for 0 < power < 2
    y_true in positive real numbers for 2 <= power
    y_pred in positive real numbers
    power in real numbers

    Link:
    y_pred = exp(raw_prediction)

    For a given sample x_i, half Tweedie deviance loss with p=power is defined
    as::

        loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p)
                    - y_true_i * exp(raw_prediction_i)**(1-p) / (1-p)
                    + exp(raw_prediction_i)**(2-p) / (2-p)

    Taking the limits for p=0, 1, 2 gives HalfSquaredError with a log link,
    HalfPoissonLoss and HalfGammaLoss.

    We also skip constant terms, but those are different for p=0, 1, 2.
    Therefore, the loss is not continuous in `power`.

    Note furthermore that although no Tweedie distribution exists for
    0 < power < 1, it still gives a strictly consistent scoring function for
    the expectation.
    c                    t         |   t        t        |            t	               ||       | j
                  j                  dk  r1t        t        j                   t        j                  dd      | _
        y | j
                  j                  dk  r"t        dt        j                  dd      | _
        y t        dt        j                  dd      | _
        y N)powerr   r   Fr9   T)r   r-   r   r   r   r    r   r   r'   r(   r)   r+   r>   r   r#   r$   r   s        r,   r-   zHalfTweedieLoss.__init__P  s    #%,7	 	 	
 ::q #+RVVGRVVUE#JD ZZ!#+ArvvtU#CD #+Arvvue#DD r.   c                    | j                   j                  dk(  rt               j                  ||      S | j                   j                  dk(  rt	               j                  ||      S | j                   j                  dk(  rt               j                  ||      S | j                   j                  }t        j                  t        j                  |d      d|z
        d|z
  z  d|z
  z  }|||z  }|S )Nr   )r<   r>   r:   r9   )r    r   r   ri   r   r   r'   maximum)r+   r<   r>   pr   s        r,   ri   z(HalfTweedieLoss.constant_to_optimal_zero^  s    ::q #%>>] ?   ZZ""$==] >   ZZ" ?;;] <   

  A88BJJvq11q59QUCq1uMD(%Kr.   Ng      ?NNrz   r   r   s   @r,   r   r   1  s    <Er.   r   c                   $     e Zd ZdZd fd	Z xZS )HalfTweedieLossIdentityan  Half Tweedie deviance loss with identity link, for regression.

    Domain:
    y_true in real numbers for power <= 0
    y_true in non-negative real numbers for 0 < power < 2
    y_true in positive real numbers for 2 <= power
    y_pred in positive real numbers for power != 0
    y_pred in real numbers for power = 0
    power in real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, half Tweedie deviance loss with p=power is defined
    as::

        loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p)
                    - y_true_i * raw_prediction_i**(1-p) / (1-p)
                    + raw_prediction_i**(2-p) / (2-p)

    Note that the minimum value of this loss is 0.

    Note furthermore that although no Tweedie distribution exists for
    0 < power < 1, it still gives a strictly consistent scoring function for
    the expectation.
    c                    t         |   t        t        |            t	               ||       | j
                  j                  dk  r1t        t        j                   t        j                  dd      | _
        n\| j
                  j                  dk  r"t        dt        j                  dd      | _
        n!t        dt        j                  dd      | _
        | j
                  j                  dk(  r1t        t        j                   t        j                  dd      | _        y t        dt        j                  dd      | _        y r   )r   r-   r   r   r   r    r   r   r'   r(   r)   r*   r   s        r,   r-   z HalfTweedieLossIdentity.__init__  s    +%,?	 	 	
 ::q #+RVVGRVVUE#JD ZZ!#+ArvvtU#CD #+Arvvue#DD ::q #+RVVGRVVUE#JD #+Arvvue#DD r.   r   r   r   s   @r,   r   r   s  s    6E Er.   r   c                   2     e Zd ZdZd fd	ZddZd Z xZS )HalfBinomialLossaY  Half Binomial deviance loss with logit link, for binary classification.

    This is also know as binary cross entropy, log-loss and logistic loss.

    Domain:
    y_true in [0, 1], i.e. regression on the unit interval
    y_pred in (0, 1), i.e. boundaries excluded

    Link:
    y_pred = expit(raw_prediction)

    For a given sample x_i, half Binomial deviance is defined as the negative
    log-likelihood of the Binomial/Bernoulli distribution and can be expressed
    as::

        loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i

    See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman,
    section 4.4.1 (about logistic regression).

    Note that the formulation works for classification, y = {0, 1}, as well as
    logistic regression, y = [0, 1].
    If you add `constant_to_optimal_zero` to the loss, you get half the
    Bernoulli/binomial deviance.

    More details: Inserting the predicted probability y_pred = expit(raw_prediction)
    in the loss gives the well known::

        loss(x_i) = - y_true_i * log(y_pred_i) - (1 - y_true_i) * log(1 - y_pred_i)
    c                 t    t         |   t               t               d||       t	        dddd      | _        y Nr9   r    r!   r"   r#   r$   r   r:   T)r   r-   r   r   r   r)   r   s       r,   r-   zHalfBinomialLoss.__init__  s>    $& 	 	
  (1dD9r.   c                 R    t        ||      t        d|z
  d|z
        z   }|||z  }|S ry   r   r   s       r,   ri   z)HalfBinomialLoss.constant_to_optimal_zero  s7    VV$uQZV'DD$M!Dr.   c                 4   |j                   dk(  r#|j                  d   dk(  r|j                  d      }t        j                  |j                  d   df|j
                        }| j                  j                  |      |dddf<   d|dddf   z
  |dddf<   |S a=  Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples,) or (n_samples, 1)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, 2)
            Element-wise class probabilities.
        r9   r:   r   rH   NrB   rC   rD   r'   rq   rI   r!   inverser+   r=   probas      r,   predict_probazHalfBinomialLoss.predict_proba       !#(<(<Q(?1(D+33A6N...q115^=Q=QRii''7ad%1+oadr.   rv   rz   r{   r|   r}   r~   r-   ri   r   r   r   s   @r,   r   r     s    >:r.   r   c                   L     e Zd ZdZdZd fd	Zd Zd	dZd Z	 	 	 	 d
dZ	 xZ
S )HalfMultinomialLossa/  Categorical cross-entropy loss, for multiclass classification.

    Domain:
    y_true in {0, 1, 2, 3, .., n_classes - 1}
    y_pred has n_classes elements, each element in (0, 1)

    Link:
    y_pred = softmax(raw_prediction)

    Note: We assume y_true to be already label encoded. The inverse link is
    softmax. But the full link function is the symmetric multinomial logit
    function.

    For a given sample x_i, the categorical cross-entropy loss is defined as
    the negative log-likelihood of the multinomial distribution, it
    generalizes the binary cross-entropy to more than 2 classes::

        loss_i = log(sum(exp(raw_pred_{i, k}), k=0..n_classes-1))
                - sum(y_true_{i, k} * raw_pred_{i, k}, k=0..n_classes-1)

    See [1].

    Note that for the hessian, we calculate only the diagonal part in the
    classes: If the full hessian for classes k and l and sample i is H_i_k_l,
    we calculate H_i_k_k, i.e. k=l.

    Parameters
    ----------
    sample_weight : {None, ndarray}
        If sample_weight is None, the hessian might be constant.

    n_classes : {None, int}
        The number of classes for classification, else None.

    xp : module or None
        Array namespace module. Ignored by the Cython implementation.

    device : device or None
        A device object. Ignored by the Cython implementation.

    References
    ----------
    .. [1] :arxiv:`Simon, Noah, J. Friedman and T. Hastie.
        "A Blockwise Descent Algorithm for Group-penalized Multiresponse and
        Multinomial Regression".
        <1311.6529>`
    Tc                     t         |   t               t               |||       t	        dt
        j                  dd      | _        t	        dddd      | _        d | _	        d | _
        d | _        y )Nr   r   TFr:   )r   r-   r	   r   r   r'   r(   r)   r*   class_indexing_offsets
y_true_inty_true_one_hotr+   r>   r"   r#   r$   r   s        r,   r-   zHalfMultinomialLoss.__init__  so    ')!# 	 	
  (2664?'1eU; '+#"r.   c                     | j                   j                  |      xr+ t        j                  |j	                  t
              |k(        S r0   )r)   r1   r'   allastypeintr2   s     r,   r4   z#HalfMultinomialLoss.in_y_true_range.  s6     ##,,Q/NBFF188C=A;M4NNr.   c                    t        j                  | j                  |j                        }t        j                  |j                        j
                  }t        | j                        D ]@  }t        j                  ||k(  |d      ||<   t        j                  ||   |d|z
        ||<   B | j                  j                  |dddf         j                  d      S )a-  Compute raw_prediction of an intercept-only model.

        This is the softmax of the weighted average of the target, i.e. over
        the samples axis=0.

        Parameters
        ----------
        y_true : array-like of shape (n_samples,)
            Observed, true target values.

        sample_weight : None or array of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        raw_prediction : numpy scalar or array of shape (n_classes,)
            Raw predictions of an intercept-only model.
        rH   r   rX   r:   N)r'   zerosr"   rI   r[   r\   rangerU   ra   r!   reshape)r+   r<   r>   outr\   ks         r,   re   z&HalfMultinomialLoss.fit_intercept_only7  s    & hht~~V\\:hhv||$((t~~&AZZ!]KCFWWSVS!c'2CF ' yy~~c$'l+33B77r.   c                 8    | j                   j                  |      S )a=  Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples, n_classes)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, n_classes)
            Element-wise class probabilities.
        )r!   r   )r+   r=   s     r,   r   z!HalfMultinomialLoss.predict_probaQ  s     yy  00r.   c                    |C|+t        j                  |      }t        j                  |      }n-t        j                  |      }n|t        j                  |      }| j                  j                  ||||||       ||fS )aK  Compute gradient and class probabilities fow raw_prediction.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : array of shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or array of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        proba_out : None or array of shape (n_samples, n_classes)
            A location into which the class probabilities are stored. If None,
            a new array might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : array of shape (n_samples, n_classes)
            Element-wise gradients.

        proba : array of shape (n_samples, n_classes)
            Element-wise class probabilities.
        )r<   r=   r>   rJ   	proba_outr@   )r'   rA   r    gradient_proba)r+   r<   r=   r>   rJ   r   r@   s          r,   r   z"HalfMultinomialLoss.gradient_proba`  s    H  !}}^<MM.9	!}}Y7l3I

!!)'% 	" 	
 Y&&r.   N   NNrz   rx   )r{   r|   r}   r~   rp   r-   r4   re   r   r   r   r   s   @r,   r   r     s8    .` M#"O841& 5'r.   r   c                   2     e Zd ZdZd fd	ZddZd Z xZS )ExponentialLossa"  Exponential loss with (half) logit link, for binary classification.

    This is also know as boosting loss.

    Domain:
    y_true in [0, 1], i.e. regression on the unit interval
    y_pred in (0, 1), i.e. boundaries excluded

    Link:
    y_pred = expit(2 * raw_prediction)

    For a given sample x_i, the exponential loss is defined as::

        loss(x_i) = y_true_i * exp(-raw_pred_i)) + (1 - y_true_i) * exp(raw_pred_i)

    See:
    - J. Friedman, T. Hastie, R. Tibshirani.
      "Additive logistic regression: a statistical view of boosting (With discussion
      and a rejoinder by the authors)." Ann. Statist. 28 (2) 337 - 407, April 2000.
      https://doi.org/10.1214/aos/1016218223
    - A. Buja, W. Stuetzle, Y. Shen. (2005).
      "Loss Functions for Binary Class Probability Estimation and Classification:
      Structure and Applications."

    Note that the formulation works for classification, y = {0, 1}, as well as
    "exponential logistic" regression, y = [0, 1].
    Note that this is a proper scoring rule, but without it's canonical link.

    More details: Inserting the predicted probability
    y_pred = expit(2 * raw_prediction) in the loss gives::

        loss(x_i) = y_true_i * sqrt((1 - y_pred_i) / y_pred_i)
            + (1 - y_true_i) * sqrt(y_pred_i / (1 - y_pred_i))
    c                 t    t         |   t               t               d||       t	        dddd      | _        y r   )r   r-   r   r   r   r)   r   s       r,   r-   zExponentialLoss.__init__  s>    #% 	 	
  (1dD9r.   c                 P    dt        j                  |d|z
  z        z  }|||z  }|S )Nr:   )r'   sqrtr   s       r,   ri   z(ExponentialLoss.constant_to_optimal_zero  s3    BGGFa&j122$M!Dr.   c                 4   |j                   dk(  r#|j                  d   dk(  r|j                  d      }t        j                  |j                  d   df|j
                        }| j                  j                  |      |dddf<   d|dddf   z
  |dddf<   |S r   r   r   s      r,   r   zExponentialLoss.predict_proba  r   r.   rv   rz   r   r   s   @r,   r   r     s    !F:r.   r   )
squared_errorabsolute_errorpinball_loss
huber_losspoisson_loss
gamma_losstweedie_lossbinomial_lossmultinomial_lossexponential_lossc                   H    e Zd ZdZ	 	 ddZ	 	 	 ddZ	 	 	 	 d	dZ	 	 	 ddZy)
ArrayAPILossMixina  Mixin for loss classes that are compatible with the array API.

    Currently this mixin redefines methods:
    - __call__(...)
    - loss(...)
    - loss_gradient(...)
    - gradient(...)

    such that they work according to the array API specification.
    It uses the attributes self.xp and self.device from BaseLoss and it assumes that
    methods self._compute_loss and self._compute_gradient are implemented.
    Nc                 l    | j                  ||d      }t        t        ||| j                              S )a  Compute the weighted average loss for the array API losses.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        n_threads : int, default=1
            Ignored by the array API implementation.

        Returns
        -------
        loss : float
            Mean or averaged loss function.
        Nr<   r=   r>   )rT   r#   )rE   r   r   r#   )r+   r<   r=   r>   r@   loss_xps         r,   rV   zArrayAPILossMixin.__call__  s8    4 )).  
 Xg}IJJr.   c                 *    | j                  |||      S )a  Compute the pointwise loss value for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        loss_out : None or C-contiguous array of shape (n_samples,)
            Ignored by the array API implementation.
        n_threads : int, default=1
            Ignored by the array API implementation.

        Returns
        -------
        loss : array of shape (n_samples,)
            Element-wise loss function.
        r  )_compute_lossrF   s         r,   rE   zArrayAPILossMixin.loss  s%    : !!)' " 
 	
r.   c                 Z    | j                  |||      }| j                  |||      }||fS )aG  Compute loss and gradient w.r.t. raw_prediction for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        loss_out : None or C-contiguous array of shape (n_samples,)
            Ignored by the array API implementation.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array             of shape (n_samples, n_classes)
            Ignored by the array API implementation.
        n_threads : int, default=1
            Ignored by the array API implementation.

        Returns
        -------
        loss : array of shape (n_samples,)
            Element-wise loss function.

        gradient : array of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.
        r  )r  _compute_gradient)	r+   r<   r=   r>   r?   rJ   r@   rE   rM   s	            r,   rK   zArrayAPILossMixin.loss_gradientA  sO    H !!)' " 

 )))' * 

 X~r.   c                 *    | j                  |||      S )ax  Compute gradient of loss w.r.t raw_prediction for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array             of shape (n_samples, n_classes)
            Ignored by the array API implementation.
        n_threads : int, default=1
            Ignored by the array API implementation.

        Returns
        -------
        gradient : array of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.
        r  )r  rN   s         r,   rM   zArrayAPILossMixin.gradientq  s%    < %%)' & 
 	
r.   ry   rw   rx   )r{   r|   r}   r~   rV   rE   rK   rM   r   r.   r,   r   r     sK    " KF !
N .h "
r.   r   c                 N   | j                   |j                  k(  rg dng d}|j                  | |d   k  ||j                  | |d   k  |j                  |      |j                  | |d   k  |j	                  d|z         |j                  | |d   k  | d|z  z   |                         S )an  Numerically stable version of log(1 + exp(x)) that is compatible with
    the array API.

    Parameters
    ----------
    raw_prediction : C-contiguous array of shape (n_samples,) or array of         shape (n_samples, n_classes)
        Raw prediction values (in link space).
    raw_prediction_exp : C-contiguous array of shape (n_samples,) or array of         shape (n_samples, n_classes)
        Exponential of the raw prediction values.
    xp : module, default=None
        Array namespace module.

    Returns
    -------
    log1pexp : float
        Numerically stable value for log(1 + exp(raw_prediction)).
    )r      gfffff@@)r   	   g333333-@r   r:   r9   g      ?r   )rI   rn   wherelog1pr   )r=   raw_prediction_expr#   	constantss       r,   	_log1pexpr    s    j 2::- 	 
 88)A,&
il*HH'(HH)A,.s//0"il2"Q);%;;"	
 r.   c                   8    e Zd ZdZ	 	 	 	 ddZ	 	 ddZ	 	 ddZy)HalfBinomialLossArrayAPIzHA version of the HalfBinomialLoss that is compatible with the array API.Nc                     | j                   j                  |      }| j                  ||||      }| j                  ||||      }	||	fS N)r<   r=   r>   r  r#   expr  r  
r+   r<   r=   r>   r?   rJ   r@   r  rE   rM   s
             r,   rK   z&HalfBinomialLossArrayAPI.loss_gradient  g     "WW[[8!!)'1	 " 
 )))'1	 * 
 X~r.   c                     || j                   j                  |      }t        ||| j                         }|||z  z
  }|||z  }|S )N)r=   r  r#   )r#   r  r  )r+   r<   r=   r>   r  log1pexprE   s          r,   r  z&HalfBinomialLossArrayAPI._compute_loss  sZ     %!%^!<)1ww

 &>11$M!Dr.   c                     | j                   }||j                  |      }d|z  }|j                  ||j                  |j                  k(  rdndkD  d|z
  ||z  z
  d|z   z  ||z
        }|||z  }|S )Nr:   r
  r  )r#   r  r  rI   rn   )r+   r<   r=   r>   r  r#   neg_raw_prediction_expgrads           r,   r  z*HalfBinomialLossArrayAPI._compute_gradient  s     WW%!#!7!"%7!7xx^%9%9RZZ%GcSQ&jF%;;;))+'	
 $M!Dr.   rx   NNr{   r|   r}   r~   rK   r  r  r   r.   r,   r  r    s2    R 8 . r.   r  c                   8     e Zd ZdZd fd	Z	 ddZ	 ddZ xZS )HalfMultinomialLossArrayAPIa  A version of the HalfMultinomialLoss that is compatible with the array API.

    Parameters
    ----------
    sample_weight : {None, ndarray}
        If sample_weight is None, the hessian might be constant.

    n_classes : {None, int}
        The number of classes for classification, else None.

    xp : module or None
        Array namespace module.

    device : device or None
        A device object.
    c                 T    t         |   |||       d | _        d | _        d | _        y )N)r"   r#   r$   )r   r-   r   r   r   r   s        r,   r-   z$HalfMultinomialLossArrayAPI.__init__7  s2    9FC '+# #r.   c                    | j                   }| j                  }t        |d|      }| j                  #|j	                  ||j
                  |      | _        | j                  2|j                  |j                  d   |      | j                  z  | _        |j                  t        |      | j                  | j                  z         }||z
  }|||z  }|S )Nr:   )rY   r#   rI   r$   r   )r$   )r#   r$   r   r   asarrayint64r   arangerC   r"   taker   )	r+   r<   r=   r>   r#   r$   log_sum_exptrue_label_probsrE   s	            r,   r  z)HalfMultinomialLossArrayAPI._compute_lossD  s     WW aB???" jjrxxjODO&&.		&,,q/&	9DNNJ ' 77>"DOOd6Q6Q$Q
 --$M!Dr.   c                 ^   | j                   }| j                  }| j                  `| j                  #|j	                  ||j
                  |      | _        t        | j                  | j                  |j                        | _        t        |      }|| j                  z  }|||d d d f   z  }|S )Nr&  )num_classesrI   )
r#   r$   r   r   r'  r(  r   r"   rI   r   )r+   r<   r=   r>   r#   device_r  s          r,   r  z-HalfMultinomialLossArrayAPI._compute_gradient\  s     WW++&&"$**V288G*"T") NN$**#D
 ~& 	###$M!T'**Dr.   r   rz   )r{   r|   r}   r~   r-   r  r  r   r   s   @r,   r#  r#  %  s!    "#" 	8 	r.   r#  c                   8    e Zd ZdZ	 	 	 	 ddZ	 	 ddZ	 	 ddZy)HalfPoissonLossArrayAPIzGA version of the HalfPoissonLoss that is compatible with the array API.Nc                     | j                   j                  |      }| j                  ||||      }| j                  ||||      }	||	fS r  r  r  s
             r,   rK   z%HalfPoissonLossArrayAPI.loss_gradient  r  r.   c                 ^    || j                   j                  |      }|||z  z
  }|||z  }|S rz   r#   r  )r+   r<   r=   r>   r  rE   s         r,   r  z%HalfPoissonLossArrayAPI._compute_loss  sA     %!%^!<!F^$;;$M!Dr.   c                 X    || j                   j                  |      }||z
  }|||z  }|S rz   r4  )r+   r<   r=   r>   r  r  s         r,   r  z)HalfPoissonLossArrayAPI._compute_gradient  s<     %!%^!<!F*$M!Dr.   rx   r   r!  r   r.   r,   r1  r1  }  s2    Q 8 $ r.   r1  )7r~   r   numpyr'   scipy.specialr   sklearn._loss._lossr   r   r   r   r	   r
   r   r   r   r   r   sklearn._loss.linkr   r   r   r   r   r   !sklearn.externals.array_api_extrar   sklearn.utilsr   sklearn.utils._array_apir   r   r   sklearn.utils.extmathr   sklearn.utils.statsr   r   r   r   r   r   r   r   r   r   r   r   r   _LOSSESr   r  r  r#  r1  r   r.   r,   <module>r@     se  *        6 & 
 * 4*T! T!n6x 62$CH $CN=( =@I@ I@Xh DH >?h ?D-Eh -E`Dx DNk'( k'\Hh HX &###%+'b
 b
JHVA02B AHU"35H Up5/ 5r.   