
    yj8                    R   d Z ddlmZ ddlZddlmZ ddlZddlm	Z	m
Z
 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ  ej        e          Ze G d d                      Ze G d d                      Zd.dZd/dZd0dZd0d Z d0d!Z!d1d%Z"d2d(Z#d3d*Z$d4d,Z%d4d-Z&dS )5z
Surprisal-based observation sampling for dream processing.

Computes geometric surprisal scores for observations using tree-based
data structures, enabling targeted deductive reasoning on anomalous
or novel observations.
    )annotationsN)	dataclass)funcselect)AsyncSession)models)settings)get_all_documents)
tracked_db)SurprisalTreecreate_treec                  <    e Zd ZU dZded<   ded<   ded<   ded<   d	S )
ObservationDataz?Plain data extracted from a Document ORM object (session-safe).stridcontentz
str | Nonelevelznp.ndarray | list[float]	embeddingN__name__
__module____qualname____doc____annotations__     =/DATA/AppData/hermes/projects/honcho/src/dreamer/surprisal.pyr   r      sB         IIGGGLLL''''''r   r   c                  2    e Zd ZU dZded<   ded<   ded<   dS )	SurprisalScorez/Container for observation with surprisal score.r   observationfloat	surprisal
np.ndarrayr   Nr   r   r   r   r   r   %   s<         99    r   r   workspace_namer   observerobservedreturnlist[SurprisalScore]c                  K   	 t          d          4 d{V }t          || ||           d{V }d |D             }t          |          t          |          z
  }|rt                              d| d           ddd          d{V  n# 1 d{V swxY w Y   |s%t                              d|  d| d|            g S t
          j        j        j        d	z  }t          |          |k     r0t                              d
t          |           d| d           g S t          |          }|j
        dk    rt                              d           g S t          |          }	t          |||	          }
d |
D             }t          |          t          |
          k     r;t                              dt          |
          t          |          z
   d           t          |          }|                    d d           t!          dt          |                    }t
          j        j        j        dz  }t                              d|dd           t                              d| d           t'          |d|         d          D ]g\  }}|j        j        }t          |          dk    r|dd         dz   }t                              d | d!|j        d"d#|j        j         d$|            ht1          |          }t                              d%t          |           dt          |           d&|dd'           |rot                              d(d)|d*         j        d"d+z   d,|d         j        d"d+z   d-t3          d. |D                       t          |          z  d"z              nt                              d/           |S # t4          $ r+}t                              d0| d1           g cY d}~S d}~ww xY w)2ag  
    Sample observations and compute surprisal scores.

    Workflow:
    1. Fetch observations based on SAMPLING_STRATEGY (short DB scope)
    2. Extract embeddings (already stored on objects, no DB needed)
    3. Build tree structure using trees.create_tree()
    4. Compute surprisal for each observation
    5. Rank by surprisal (highest first)
    6. Filter by threshold and take top N

    Args:
        workspace_name: Workspace identifier
        observer: Observer peer name
        observed: Observed peer name

    Returns:
        List of SurprisalScore objects, ranked by surprisal (highest first)
    zdream.surprisal.fetchN)dbr$   r%   r&   c                j    g | ]0}|j         	t          |j        |j        |j        |j                   1S )N)r   r   r   r   )r   r   r   r   r   .0obss     r   
<listcomp>z6sample_observations_with_surprisal.<locals>.<listcomp>Q   sQ     	 	 	 =,  vK)!m	   -,,r   zSkipped z" observations with null embeddingszNo observations found for /   zToo few observations (z < z!), skipping surprisal computationr   zFailed to extract embeddingsc                v    g | ]6}t          j        |j                  t          j        |j                  4|7S r   )npisinfr"   isnanr-   ss     r   r/   z6sample_observations_with_surprisal.<locals>.<listcomp>{   sM     
 
 
RXak%:%:
CE8AKCXCX

 
 
r   z	Filtered z invalid surprisal scoresc                    | j         S Nr"   )xs    r   <lambda>z4sample_observations_with_surprisal.<locals>.<lambda>   s    Q[ r   T)keyreverse   d   u0   🎯 Surprisal computation complete. Taking top z.0f%zTop z, observations by normalized surprisal score:   P   M   z...z  #z [surprisal=z.3fz	] [level=z] z
Selected: z observations (top z%)u   📊 Filtered statistics: zmin=z, zmax=zmean=c              3  $   K   | ]}|j         V  d S r9   r:   r6   s     r   	<genexpr>z5sample_observations_with_surprisal.<locals>.<genexpr>   s$      <<aak<<<<<<r   z0No observations exceeded the surprisal thresholdzSurprisal sampling failed: )exc_info)r   _fetch_observationslenloggerwarningr	   DREAM	SURPRISALTREE_K_extract_embeddingssizeerror_build_tree_compute_surprisal_scores_normalize_scoressortminTOP_PERCENT_SURPRISALinfo	enumerater    r   r"   r   _filter_by_percentsum	Exception)r$   r%   r&   r*   raw_observationsobservationsskippedmin_observations
embeddingstreescoresvalid_scoresnormalized_scorestop_npercentiscorer   filteredes                       r   "sample_observations_with_surprisalrm   .   s     0f566 	W 	W 	W 	W 	W 	W 	W"%8-!!	& & &            	 	 ,	 	 	L *++c,.?.??G WU'UUUVVV+	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W0  	NNS^SShSSSS   I $>3:Q>|///NNr\):):rr?Orrr   I )66
?aLL7888I :&& +<TJJ
 

 
 
 |s6{{**NNVCKK#l*;*;;VVV  
 .l;; 	#8#8$GGG As,--...*@3FUwUUUUVVVN5NNNOOO!"3FUF";Q?? 	 	HAu'/G7||b  !#2#,.KKgaggU_ggg5CTCZgg^egg    &&788^X^^\):):^^w^^^^	
 	
 	

  	LKK,7"/777786!.66667 S#<<8<<<<<s8}}LRRRS    KKJKKK   6166FFF						sU   O A%B>O 
BO B)O AO $5O I;O 
P  P PPr*   r   list[models.Document]c                  K   t           j        j        j        }t           j        j        j        }t           j        j        j        }|dk    rt          | |||||           d{V S |dk    rt          | |||||           d{V S |dk    rt          | |||||           d{V S t          
                    d| d           t          | |||||           d{V S )a  
    Fetch observations based on configured sampling strategy.

    Args:
        db: Database session
        workspace_name: Workspace identifier
        observer: Observer peer name
        observed: Observed peer name

    Returns:
        List of Document objects
    recent)r*   r$   r%   r&   limitlevelsNrandomallzUnknown sampling strategy: z, using 'recent')r	   rM   rN   SAMPLING_STRATEGYSAMPLE_SIZEINCLUDE_LEVELS_fetch_recent_observations_fetch_random_observations_fetch_all_observationsrK   rL   )r*   r$   r%   r&   strategysample_sizerr   s          r   rI   rI      s     $ ~'9H.*6K^%4F8/)
 
 
 
 
 
 
 
 
 	
 
X		/)
 
 
 
 
 
 
 
 
 	
 
U		,)
 
 
 
 
 
 
 
 
 	
 	OXOOOPPP/)
 
 
 
 
 
 
 
 
 	
r   rq   intrr   	list[str]c                   K   t          ||||rdd|iind|          }|                     |           d{V }t          |                                                                          S )a  
    Fetch most recent observations.

    Uses existing get_all_documents() query with level filtering.

    Args:
        db: Database session
        workspace_name: Workspace identifier
        observer: Observer peer name
        observed: Observed peer name
        limit: Maximum number of observations to fetch
        levels: Document levels to include

    Returns:
        List of Document objects ordered by created_at DESC
    r   inN)r$   r%   r&   filtersrq   )r
   executelistscalarsrt   r*   r$   r%   r&   rq   rr   stmtresults           r   rx   rx      s      0 %-3=4.))  D ::d########F  $$&&'''r   c                T  K   t          t          j                                      t          j        j        |k    t          j        j        |k    t          j        j        |k              }|r7|                    t          j        j                            |                    }|	                    t          j                                                  |          }|                     |           d{V }t          |                                                                          S )a  
    Fetch random sample of observations.

    Uses PostgreSQL's random() function for efficient random sampling.

    Args:
        db: Database session
        workspace_name: Workspace identifier
        observer: Observer peer name
        observed: Observed peer name
        limit: Maximum number of observations to fetch
        levels: Document levels to include

    Returns:
        List of Document objects in random order
    N)r   r   Documentwherer$   r%   r&   r   in_order_byr   rs   rq   r   r   r   rt   r   s           r   ry   ry     s      0 &/""((&.8 H, H, D  =zz&//33F;;<< ==''--e44D::d########F  $$&&'''r   c                p  K   t          t          j                                      t          j        j        |k    t          j        j        |k    t          j        j        |k                                  t          j        j        	                                          
                    |          }|r7|                    t          j        j                            |                    }|                     |           d{V }t          |                                                                          S )a  
    Fetch all observations up to limit.

    Orders by created_at DESC for consistency.

    Args:
        db: Database session
        workspace_name: Workspace identifier
        observer: Observer peer name
        observed: Observed peer name
        limit: Maximum number of observations to fetch
        levels: Document levels to include

    Returns:
        List of Document objects ordered by created_at DESC
    N)r   r   r   r   r$   r%   r&   r   
created_atdescrq   r   r   r   r   r   rt   r   s           r   rz   rz   8  s      2 	v	O*n<O$0O$0

 


 
&/,1133	4	4	u 	  =zz&//33F;;<<::d########F  $$&&'''r   r_   list[ObservationData]r#   c                    | st          j        g           S d | D             }t          j        |t           j                  }|S )z
    Extract embeddings from observations as numpy array.

    Args:
        observations: List of Document objects with embeddings

    Returns:
        np.ndarray of shape (N, 1536) containing embeddings
    c                    g | ]	}|j         
S r   )r   r,   s     r   r/   z'_extract_embeddings.<locals>.<listcomp>o  s    ===s}===r   )dtype)r3   arrayfloat32)r_   embeddings_listembeddings_arrays      r   rP   rP   b  sJ      x||=====OxrzBBBr   rb   r   c                    | j         dk    r#t          t          j        j        j                  S t          t          j        j        j        t          j        j        j                  }|                    |            |S )z
    Build tree structure from embeddings.

    Args:
        embeddings: np.ndarray of shape (N, embedding_dim)

    Returns:
        SurprisalTree configured per settings
    r   )	tree_typek)rQ   r   r	   rM   rN   	TREE_TYPErO   batch_insert)rb   rc   s     r   rS   rS   u  sl     !8>3=>>>.*4
.
"
)  D
 	j!!!Kr   rc   c                    g }t          | |d          D ]?\  }}|                    |          }|                    t          |||                     @|S )a)  
    Compute surprisal score for each observation.

    Args:
        observations: List of ObservationData objects
        embeddings: np.ndarray of embeddings matching observations
        tree: Built SurprisalTree

    Returns:
        List of SurprisalScore objects (unfiltered, unsorted)
    F)strictr    r"   r   )zipr"   appendr   )r_   rb   rc   rd   r.   r   r"   s          r   rT   rT     sy      $&FlJuEEE 	
 	
YNN9--	##  	
 	
 	
 	
 Mr   rd   c                   | sg S d | D             }t          |          }t          |          }||k    rd | D             S g }| D ]A}|j        |z
  ||z
  z  }|                    t	          |j        ||j                             B|S )z
    Normalize surprisal scores to [0, 1] range using min-max normalization.

    Args:
        scores: List of SurprisalScore objects with raw surprisal values

    Returns:
        List of SurprisalScore objects with normalized surprisal values
    c                    g | ]	}|j         
S r   r:   r6   s     r   r/   z%_normalize_scores.<locals>.<listcomp>  s    444444r   c                F    g | ]}t          |j        d |j                  S )g      ?r   )r   r    r   r6   s     r   r/   z%_normalize_scores.<locals>.<listcomp>  sD     
 
 
  MSAK  
 
 
r   r   )rW   maxr"   r   r   r    r   )rd   surprisal_valuesmin_surprisalmax_surprisal
normalizedrj   normalized_values          r   rU   rU     s      	 54V444())M())M%%
 
 	
 
 
 	
 (*J 

 

!Om;M)
 	!-*/  	
 	
 	
 	
 r   c                    | sg S t           j        j        j        }t	          dt          t          |           |z                      }| d|         S )a  
    Filter observations by top percentage.

    Assumes scores are already sorted by surprisal (highest first).

    Args:
        scores: List of SurprisalScore objects, sorted by surprisal DESC

    Returns:
        Filtered list of SurprisalScore objects (top N% by surprisal)
    rB   N)r	   rM   rN   rX   r   r}   rJ   )rd   top_percentcounts      r   r[   r[     sP      	 .*@K3s6{{[01122E&5&>r   )r$   r   r%   r   r&   r   r'   r(   )
r*   r   r$   r   r%   r   r&   r   r'   rn   )r*   r   r$   r   r%   r   r&   r   rq   r}   rr   r~   r'   rn   )r_   r   r'   r#   )rb   r#   r'   r   )r_   r   rb   r#   rc   r   r'   r(   )rd   r(   r'   r(   )'r   
__future__r   loggingdataclassesr   numpyr3   
sqlalchemyr   r   sqlalchemy.ext.asyncior   srcr   
src.configr	   src.crud.documentr
   src.dependenciesr   src.dreamer.treesr   r   	getLoggerr   rK   r   r   rm   rI   rx   ry   rz   rP   rS   rT   rU   r[   r   r   r   <module>r      s+    # " " " " "  ! ! ! ! ! !     # # # # # # # # / / / / / /             / / / / / / ' ' ' ' ' ' 8 8 8 8 8 8 8 8		8	$	$ ( ( ( ( ( ( ( (        ~ ~ ~ ~B:
 :
 :
 :
z!( !( !( !(H%( %( %( %(P'( '( '( '(T   &   0   @) ) ) )X     r   