
    Q3ja                     f    d Z ddlZddlmZ ddlmZmZmZ ddl	m
Z
 ddlmZmZ d Z G d d	      Zy)
zA
Loss functions for linear models with raw_prediction = X @ coef
    N)sparse)get_namespaceget_namespace_and_devicemove_to)_align_api_if_sparse)safe_sparse_dotsquared_normc           
          | j                   d   }t        j                  |       r=t        t	        | j
                  t        j                  |df||f      | z  d            S |dddf   | z  }| j
                  |z  S )z/Compute the sandwich product X.T @ diag(W) @ X.r   shapeT)dense_outputN)r   r   issparser   r   T	dia_array)XW	n_samplesWXs       N/DATA/.local/lib/python3.12/site-packages/sklearn/linear_model/_linear_loss.pysandwich_dotr      sz     
Iq#  !Q	9/EFJ!
 	
 q$wZ!^ssRx    c                   ~    e Zd ZdZd ZddZd Zd Zd Z	 	 	 	 ddZ		 	 	 	 dd	Z
	 	 	 	 dd
Z	 	 	 	 	 	 ddZ	 ddZy)LinearModelLossa>	  General class for loss functions with raw_prediction = X @ coef + intercept.

    Note that raw_prediction is also known as linear predictor.

    The loss is the average of per sample losses and includes a term for L2
    regularization::

        loss = 1 / s_sum * sum_i s_i loss(y_i, X_i @ coef + intercept)
               + 1/2 * l2_reg_strength * ||coef||_2^2

    with sample weights s_i=1 if sample_weight=None and s_sum=sum_i s_i.

    Gradient and hessian, for simplicity without intercept, are::

        gradient = 1 / s_sum * X.T @ loss.gradient + l2_reg_strength * coef
        hessian = 1 / s_sum * X.T @ diag(loss.hessian) @ X
                  + l2_reg_strength * identity

    Conventions:
        if fit_intercept:
            n_dof =  n_features + 1
        else:
            n_dof = n_features

        if base_loss.is_multiclass:
            coef.shape = (n_classes, n_dof) or ravelled (n_classes * n_dof,)
        else:
            coef.shape = (n_dof,)

        The intercept term is at the end of the coef array:
        if base_loss.is_multiclass:
            if coef.shape (n_classes, n_dof):
                intercept = coef[:, -1]
            if coef.shape (n_classes * n_dof,)
                intercept = coef[n_classes * n_features:] = coef[(n_dof-1):]
            intercept.shape = (n_classes,)
        else:
            intercept = coef[-1]

        Shape of gradient follows shape of coef.
        gradient.shape = coef.shape

        But hessian (to make our lives simpler) are always 2-d:
        if base_loss.is_multiclass:
            hessian.shape = (n_classes * n_dof, n_classes * n_dof)
        else:
            hessian.shape = (n_dof, n_dof)

    Note: if coef has shape (n_classes * n_dof,), the classes are expected to be
    contiguous, i.e. the 2d-array can be reconstructed as

        coef.reshape((n_classes, -1), order="F")

    The option order="F" makes coef[:, i] contiguous. This, in turn, makes the
    coefficients without intercept, coef[:, :-1], contiguous and speeds up
    matrix-vector computations.

    Note: If the average loss per sample is wanted instead of the sum of the loss per
    sample, one can simply use a rescaled sample_weight such that
    sum(sample_weight) = 1.

    Parameters
    ----------
    base_loss : instance of class BaseLoss from sklearn._loss.
    fit_intercept : bool
    c                      || _         || _        y N)	base_lossfit_intercept)selfr   r   s      r   __init__zLinearModelLoss.__init__s   s    "*r   Nc                 
   |j                   d   }| j                  j                  }| j                  r|dz   }n|}| j                  j                  rt        j                  ||f|d      }|S t        j                  ||      }|S )a  Allocate coef of correct shape with zeros.

        Parameters:
        -----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        dtype : data-type, default=None
            Overrides the data type of coef. With dtype=None, coef will have the same
            dtype as X.

        Returns
        -------
        coef : ndarray of shape (n_dof,) or (n_classes, n_dof)
            Coefficients of a linear model.
           F)r   dtypeorder)r   r#   )r   r   	n_classesr   is_multiclassnpzeros)r   r   r#   
n_featuresr%   n_dofcoefs          r   init_zero_coefzLinearModelLoss.init_zero_coefw   sy      WWQZ
NN,,	NEE>>''889e"4EMD  88%u5Dr   c                 <   | j                   j                  s"| j                  r|d   }|dd }||fS d}|}||fS |j                  dk(  r*|j	                  | j                   j
                  dfd      }n|}| j                  r|dddf   }|ddddf   }||fS d}||fS )a  Helper function to get coefficients and intercept.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").

        Returns
        -------
        weights : ndarray of shape (n_features,) or (n_classes, n_features)
            Coefficients without intercept term.
        intercept : float or ndarray of shape (n_classes,)
            Intercept terms.
        N        r!   r"   r$   )r   r&   r   ndimreshaper%   )r   r+   	interceptweightss       r   weight_interceptz LinearModelLoss.weight_intercept   s    $ ~~++!! H	s)  	!!  	 	!! yyA~,,(@(@"'ES,Q!!#ArEN	!!SbS&/ 	!!  		!!r   c                 ,   | j                  |      \  }}t        |      \  }}}|j                  ||j                  |      }|j                  ||j                  |      }	| j                  j
                  s	||z  |	z   }
n||j                  z  |	z   }
|||
fS )ai  Helper function to get coefficients, intercept and raw_prediction.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.

        Returns
        -------
        weights : ndarray of shape (n_features,) or (n_classes, n_features)
            Coefficients without intercept term.
        intercept : float or ndarray of shape (n_classes,)
            Intercept terms.
        raw_prediction : ndarray of shape (n_samples,) or             (n_samples, n_classes)
        )r#   device)r5   r   asarrayr#   r   r&   r   )r   r+   r   r4   r3   xp_device_
weights_xpintercept_xpraw_predictions              r   weight_intercept_rawz$LinearModelLoss.weight_intercept_raw   s    , "22481!4Aw ZZqwwwZG
zz)1777zK~~++^l:N -<N	>11r   c                 b    |j                   dk(  r||z  n
t        |      }t        d|z  |z        S )z5Compute L2 penalty term l2_reg_strength/2 *||w||_2^2.r!   g      ?)r1   r	   float)r   r4   l2_reg_strengthnorm2_ws       r   
l2_penaltyzLinearModelLoss.l2_penalty   s3    '.||q'8'G#l7>SS?*W455r   c                 t   |j                   d   }|| j                  ||      \  }	}
}n| j                  |      \  }	}
| j                  j	                  ||||      }t        |||      \  }}||n|j                  |      }t        |j                  |      |z        }|dkD  r|| j                  |	|      z  }|S )a  Compute the loss as weighted average over point-wise losses.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        loss : float
            Weighted average of losses per sample, plus penalty.
        r   y_truer>   sample_weight	n_threads)	r   r?   r5   r   lossr   sumrA   rD   )r   r+   r   yrH   rB   rI   r>   r   r4   r3   rJ   r9   r:   sw_sums                  r   rJ   zLinearModelLoss.loss   s    N GGAJ	!151J1J4QR1S.GY!%!6!6t!<GY~~"")'	 # 
 aM2A+39NRVVD\F*+QDOOG_==Dr   c                 <   |j                   | j                  j                  c\  }}	}
|	t        | j                        z   }|| j                  ||      \  }}}n| j                  |      \  }}| j                  j                  ||||      \  }}t        |||      \  }}||n|j                  |      }t        |j                  |      |z        }|| j                  ||      z  }||z  }| j                  j                  sot        j                  ||j                        }|j                   |z  }t#        |t        d      ||z  z   |d|	 | j                  r|j                  |      |d<   ||fS t        j$                  |
|f|j                  d      }|j                   |z  }t#        |t        d      ||z  z   |ddd|	f<   | j                  r*t#        |j                  |d	
      t        d      |dddf<   |j&                  dk(  r|j)                  d      }||fS )a\  Computes the sum of loss and gradient w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        loss : float
            Weighted average of losses per sample, plus penalty.

        gradient : ndarray of shape coef.shape
             The gradient of the loss.
        NrF   r#   cpu)r9   r7   r.   r"   r#   r$   r   axisr!   r0   )r   r   r%   intr   r?   r5   loss_gradientr   rK   rA   rD   r&   r'   
empty_liker#   r   r   emptyr1   ravel)r   r+   r   rL   rH   rB   rI   r>   r   r)   r%   r*   r4   r3   rJ   grad_pointwiser9   r:   rM   gradX_gradgrad_Xs                         r   rU   zLinearModelLoss.loss_gradient#  s   T ./WWdnn6N6N*JS!3!344!151J1J4QR1S.GY!%!6!6t!<GY#~~;;)'	  <  
n aM2A+39NRVVD\F*+99& ~~++==W]];DSS>)F2e47PP * !!66.1R$ Tz 88Y.gmm3OD#%%)F2e47PP KZK  !!%FF>F2r%QU yyA~zzz,Tzr   c                 F   |j                   | j                  j                  c\  }}	}
|	t        | j                        z   }|| j                  ||      \  }}}n| j                  |      \  }}| j                  j                  ||||      }||nt        j                  |      }||z  }| j                  j                  sZt        j                  ||j                        }|j                  |z  ||z  z   |d|	 | j                  r|j                         |d<   |S t        j                  |
|f|j                  d      }|j                  |z  ||z  z   |ddd|	f<   | j                  r|j                  d      |dddf<   |j                  d	k(  r|j!                  d
      S |S )a  Computes the gradient w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        gradient : ndarray of shape coef.shape
             The gradient of the loss.
        NrF   rO   r.   r"   rQ   r   rR   r!   r0   )r   r   r%   rT   r   r?   r5   gradientr'   rK   r&   rV   r#   r   rW   r1   rX   )r   r+   r   rL   rH   rB   rI   r>   r   r)   r%   r*   r4   r3   rY   rM   rZ   s                    r   r^   zLinearModelLoss.gradient}  s   N ./WWdnn6N6N*JS!3!344!151J1J4QR1S.GY!%!6!6t!<GY00)'	 1 
 ,39N& ~~++==W]];D !n 47P PD*!!)--/RK88Y.gmm3OD#1#3#3a#7/G:S#SDKZK !!,00a08QUyyA~zzz,,r   c
                 $   |j                   | j                  j                  c\  }
}}|t        | j                        z   }|	| j                  ||      \  }}}	n| j                  |      \  }}||
nt        j                  |      }|#t        j                  ||j                  d      }nx|j                   |j                   k7  r&t        d|j                    d|j                    d      | j                  j                  r!|j                  j                  st        d      |}|j                  }|$t        j                   ||f|j                        }n~|j                   ||fk7  rt        d	||f d
|j                   d      | j                  j                  r7|j                  j"                  s!|j                  j                  st        d      |}| j                  j                  s:| j                  j%                  ||	||      \  }}||z  }||z  }t        j&                  |dk  |      dkD  }t        j(                  |      }|j*                  |z  ||z  z   |d| | j                  r|j                         |d<   |r|||fS t-        ||      |d|d|f<   |dkD  rA|j                  j"                  rdnd}|j/                  d|      d||z  |dz   xx   |z  cc<   | j                  r|j*                  |z  }||dddf<   ||dddf<   |j                         |d<   nj| j                  j1                  ||	||      \  }}||z  }|j/                  ||fd      }|j*                  |z  ||z  z   |ddd|f<   | j                  r|j                  d      |dddf<   |j2                  dk(  r|j5                  d      }|||z  }nd|z  }t7        |      D ]]  }|dd|f   d|dd|f   z
  z  |z  }t-        ||      ||||z  ||||z  |f<   | j                  rV|j*                  |z  }|||||z  |||z  |z   f<   ||||z  |z   |||z  |f<   |j                         |||z  |z   ||z  |z   f<   t7        |dz   |      D ]  }|dd|f    |dd|f   z  |z  }t-        ||      ||||z  ||||z  |f<   | j                  rV|j*                  |z  }|||||z  |||z  |z   f<   ||||z  |z   |||z  |f<   |j                         |||z  |z   ||z  |z   f<   ||d||d|f   ||d||d|f<    ` |dkD  rJ|j                  j"                  rdnd}|j/                  d|      d|dz  |z  |z  ||z  dz   xx   |z  cc<   d}|||fS )a~  Computes gradient and hessian w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        gradient_out : None or ndarray of shape coef.shape
            A location into which the gradient is stored. If None, a new array
            might be created.
        hessian_out : None or ndarray of shape (n_dof, n_dof) or             (n_classes * n_dof, n_classes * n_dof)
            A location into which the hessian is stored. If None, a new array
            might be created.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        gradient : ndarray of shape coef.shape
             The gradient of the loss.

        hessian : ndarray of shape (n_dof, n_dof) or             (n_classes, n_dof, n_dof, n_classes)
            Hessian matrix.

        hessian_warning : bool
            True if pointwise hessian has more than 25% of its elements non-positive.
        Nr"   rQ   z4gradient_out is required to have shape coef.shape = z; got .z"gradient_out must be F-contiguous.rO   z'hessian_out is required to have shape (z); got hessian_out.shape=zhessian_out must be contiguous.rF   r   )r4   g      ?r.   Cr0   r!   )r.   r.   rR   g      ?   F)r   r   r%   rT   r   r?   r5   r'   rK   rV   r#   
ValueErrorr&   flagsf_contiguoussizerW   c_contiguousgradient_hessianaverageabsr   r   r2   gradient_probar1   rX   range)r   r+   r   rL   rH   rB   rI   gradient_outhessian_outr>   r   r)   r%   r*   r4   r3   rM   rZ   nhessrY   hess_pointwisehessian_warningr$   Xhprobaswkhls                                 r   rh   z LinearModelLoss.gradient_hessian  s   n ./WWdnn6N6N*JS!3!344!151J1J4QR1S.GY!%!6!6t!<GY+39N ==W]]#FD4::-Ftzzl S#))*!-  ^^)),2D2D2Q2QABBDII88QF'--8D1a&(9!Q$ @&$$&a)  ^^))!!..{7H7H7U7U>??D~~++-1^^-L-L-+#	 .M .*NN f$Nf$N 

>Q.FM   VVN3N !n 47P PD*!!)--/RT?22-9!^-LD*kzk)*"  $zz66CRu-.Re1CPQ	.RS#S !! SS>) "SbS"W "R"W-113V %)NN$A$A-+#	 %B %!NE f$N<<E 2#<>D#1#3#3a#7/G:S#SDKZK !!,00a08QUyyA~zzz,L ("V+6\9% !Q$K1uQT{?3b8 !A& 	J.:	J.:< %%qB  I
2Y>!J.24  !J.2I
2Y>@
  Z/!3Y5Ka5OOP q1ui0Aq!tuQT{2R7A %Q* I
2Y>I
2Y>@ ))SS1W  	J 6B%
2Q68  %
2Q6	J 6BD
 EEG Y3a7Z9ORS9SST 8<ALyL!,Y,<V7WDIq|)|34+ 1/ &\ "#zz66CRu-Sy!|j058Y=NQR=RS$% 
 $OT?**r   c                 B    j                    j                  j                  c\  }t         j                        z    j                        \  }}	|nt        j                         j                  j                  sJ j                  j                  ||	|      \  }
}|
z  }
|z  }t        j                  j                        }j                  |
z  z  z   |d  j                  r|
j                         |d<   |j                         t        j                        rt        j                  |df||f      z  n|ddt        j                   f   z   j                  rMt        j"                  t        j$                  j                  d                  t        j&                         fd}||fS  j                  j)                  ||	|      \  }
|
z  }
t        j*                  fj                  d	
      }|
j                  z  z  z   |dddf<    j                  r|
j                  d      |dddf<    fd}j,                  dk(  r|j/                  d	      |fS ||fS )a  Computes gradient and hessp (hessian product function) w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.

        Returns
        -------
        gradient : ndarray of shape coef.shape
             The gradient of the loss.

        hessp : callable
            Function that takes in a vector input of shape of gradient and
            and returns matrix-vector product with hessian.
        NrF   rO   r.   r   r   rR   c                    t        j                  |       }t        j                        rj                  | d  z  z  |d  n2t         j
                  j                  j                  | d  g      |d  |d xxx | d  z  z  ccc j                  r(|d xxx | d   z  z  ccc | d  z  | d   z  z   |d<   |S )Nr.   )r'   rV   r   r   r   linalg	multi_dotr   )	sretr   hXhX_sumhessian_sumrB   r)   r   s	     r   hesspz7LinearModelLoss.gradient_hessian_product.<locals>.hessp  s    mmA&??1%'(ssb1[j>.A'BC$')yy':':ACCQ{
^;T'UC$KZ Oan$DD %%$"6$$q*~5ae8KKCG
r   r"   rQ   c                 T   | j                  dfd      } j                  r| d d df   }| d d d df   } nd}| j                  z  |z   }|
|z  j                  d      d d t        j
                  f   z  }|
z  }|d d t        j
                  f   z  }t	        j                  fj                  d      }|j                  z  z  | z  z   |d d d 	f<   j                  r|j                  d      z  |d d df<   j                  dk(  r|j                  d      S |S )Nr.   r"   r0   r   r!   rR   rQ   )
r2   r   r   rK   r'   newaxisrW   r#   r1   rX   )r}   s_intercepttmp	hess_prodr   r+   rB   r%   r*   r)   rt   rH   r   rM   r4   s       r   r   z7LinearModelLoss.gradient_hessian_product.<locals>.hesspD  s6   IIy"oSI9%%"#ArE(K!SbS&	A"#K!##g+((a(0BJJ??u ,=BJJ77C HHi%7w}}TWX	-0UUQY&,@?UVCV,V	![j[.)%%'*wwAw'?Iae$99>$???55$$r   r!   r0   )r   r   r%   rT   r   r?   r'   rK   r&   rh   rV   r#   r   r   r   r   r   squeezer8   
atleast_1drk   rW   r1   rX   )r   r+   r   rL   rH   rB   rI   r   r3   r>   rY   rq   rZ   r   r   r   r   r%   r*   r)   rt   rM   r4   s   ``` ``        @@@@@@@@@r   gradient_hessian_productz(LinearModelLoss.gradient_hessian_product  s   @ ./WWdnn6N6N*JS!3!344-1-F-FtQ-O*N+39N~~++-1^^-L-L-+#	 .M .*NN f$Nf$N==W]];D !n 47P PD*!!)--/R ),,.Kq!$$na%8I@VW 
 $ArzzM2Q6!! BJJrvv1v~$>?v. \ U{w %)NN$A$A-+#	 %B %!NE f$N88Y.gmm3OD#1#3#3a#7/G:S#SDKZK !!,00a08QU.% %. yyA~zzz,e33U{r   r   )Nr/   r!   N)Nr/   r!   NNN)Nr/   r!   )__name__
__module____qualname____doc__r   r,   r5   r?   rD   rJ   rU   r^   rh   r    r   r   r   r   /   s    AF+8%"N&2P6 :B X~ G\ +D NOWr   r   )r   numpyr'   scipyr   sklearn.utils._array_apir   r   r   sklearn.utils._sparser   sklearn.utils.extmathr   r	   r   r   r   r   r   <module>r      s5      
 7 ?6o or   