
    yj#                     @   d dl Z d dlmZ d dlmZ d dlmZmZ d dlm	Z	m
Z
mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z'm(Z( d dl)m*Z* d dl+m,Z,m-Z-m.Z.  ee/          Z0ddddde1de1de1de2e1ef         dz  de3de4dz  dee5ej6                          fdZ7dddde1de2e1ef         dz  de3dee5ej6                          fd Z8d!dd"d#ede1de1de1de4d$e1dz  deej6                 fd%Z9d!d&d#ede1de1de1de4deej6                 fd'Z:de3fd(Z;	 	 	 dJde1de1de1d*e<e=         d+e4d,e=dz  de2e1ef         dz  de<e1         dz  fd-Z>	 dKd#ede1de1de1d.e<e1         de2e1ef         dz  de<ej6                 fd/Z?d#ede1de1de1d*e<e=         de2e1ef         dz  d,e=dz  d+e4de<ej6                 fd0Z@ddd)dd1d#edz  de1d2e1de1de1de2e1ef         dz  d,e=dz  d+e4d*e<e=         dz  deej6                 fd3ZAdd4d#ed5e<ejB                 de1de1de1d6e3de<ejB                 fd7ZCdd8d#ede1d9e1de1de1d$e1dz  ddfd:ZDdd8d#ede1d.ee1         de1de1d$e1dz  de<e5e1e1f                  fd;ZEd#ede1d9e1ddfd<ZFd#ed=eejG                 de1de<ej6                 fd>ZHd#ed?ejB        de1de1de1de3fd@ZI	 	 dLd#edBe-dCe4dDe4de4f
dEZJd#ede1d.e<e1         deej6                 fdFZKdddGd#ede1dHe1de1dz  de1dz  deej6                 fdIZLdS )M    N)Sequence)	getLogger)Anycast)deleteselectupdate)CursorResult)IntegrityError)AsyncSession)Select)func)modelsschemas)settings)get_or_create_collection)get_peer)get_session)
tracked_db)embedding_client)ResourceNotFoundExceptionValidationExceptionVectorStoreError)apply_filter)VectorRecordVectorStoreget_external_vector_storeF)filtersreverselimitworkspace_nameobserverobservedr   r   r    returnc                   t          t          j                                      t          j        j        | k                                  t          j        j        |k                                  t          j        j        |k                                  t          j        j                            d                    }t          |t          j        |          }|r7|
                    t          j        j                                                  }n6|
                    t          j        j                                                  }||                    |          }|S )a  
    Get all documents in a collection.

    Returns a Select query for pagination support via apaginate().
    Results are ordered by created_at timestamp.

    Args:
        workspace_name: Name of the workspace
        observer: Name of the observing peer
        observed: Name of the observed peer
        filters: Optional filters to apply
        reverse: Whether to reverse the order (oldest first)

    Returns:
        Select query for documents
    N)r   r   Documentwherer!   r"   r#   
deleted_atis_r   order_by
created_atascdescr    )r!   r"   r#   r   r   r    stmts          9/DATA/AppData/hermes/projects/honcho/src/crud/document.pyget_all_documentsr0   #   s    4 	v	v-?	@	@	v'83	4	4	v'83	4	4	v)--d33	4	4 	 fow77D  @}}V_7;;==>>}}V_7<<>>??zz%  K    )r   r   c                   t          t          j                                      t          j        j        | k                                  t          j        j                            d                    }t          |t          j        |          }|r7|                    t          j        j	        
                                          }n6|                    t          j        j	                                                  }|S )ax  
    Get all documents using custom filters.

    Returns a Select query for pagination support via apaginate().
    Results are ordered by created_at timestamp.

    Args:
        workspace_name: Name of the workspace
        filters: Optional filters to apply
        reverse: Whether to reverse the order (oldest first)

    Returns:
        Select query for documents
    N)r   r   r&   r'   r!   r(   r)   r   r*   r+   r,   r-   )r!   r   r   r.   s       r/   get_documents_with_filtersr3   S   s    * 	v	v-?	@	@	v)--d33	4	4 	 fow77D  @}}V_7;;==>>}}V_7<<>>??Kr1   
   )r    session_namedbr5   c                  K   t          t          j                                      t          j        j        |k    t          j        j        |k    t          j        j        |k    t          j        j                            d                    }|(|                    t          j        j	        |k              }|
                    t          j        j                                                                      |          }|                     |           d{V }|                                                                S )a  
    Query most recent documents.

    Args:
        db: Database session
        workspace_name: Name of the workspace
        observer: Name of the observing peer
        observed: Name of the observed peer
        limit: Maximum number of documents to return
        session_name: Optional session name to filter by

    Returns:
        Sequence of documents ordered by created_at descending
    N)r   r   r&   r'   r!   r"   r#   r(   r)   r5   r*   r+   r-   r    executescalarsall)r6   r!   r"   r#   r    r5   r.   results           r/   query_documents_recentr<   y   s      . &/""((&.8 H, H,"&&t,,	 D zz&/6,FGG==388::;;AA%HHD::d########F>>!!!r1   )r    c                *  K   t          t          j                                      t          j        j        |k    t          j        j        |k    t          j        j        |k    t          j        j                            d                    	                    t          j        j
                                                                      |          }|                     |           d{V }|                                                                S )a~  
    Query documents sorted by times_derived (most reinforced first).

    Args:
        db: Database session
        workspace_name: Name of the workspace
        observer: Name of the observing peer
        observed: Name of the observed peer
        limit: Maximum number of documents to return

    Returns:
        Sequence of documents ordered by times_derived descending
    N)r   r   r&   r'   r!   r"   r#   r(   r)   r*   times_derivedr-   r    r8   r9   r:   )r6   r!   r"   r#   r    r.   r;   s          r/   query_documents_most_derivedr?      s      , 	v	O*n<O$0O$0O&**400	

 

 
&//4466	7	7	u 	 ::d########F>>!!!r1   c                  P    t           j        j        dk    pt           j        j         S )z@Check whether queries should go through pgvector (DB-only) path.pgvector)r   VECTOR_STORETYPEMIGRATED r1   r/   _uses_pgvectorrF      s%     	"j0V8M8V4Vr1      	embeddingtop_kmax_distancec                   K   t                      rdS t                      }|g S |                    d| ||          }i }	|rdD ]}
|
|v r||
         |	|
<   |                    |||||	r|	ndd           d{V }|sg S d |D             S )ua  Query external vector store for document IDs sorted by similarity.

    No DB session needed — safe to call outside a tracked_db scope.

    Returns:
        Ordered list of document IDs on the external-store path,
        empty list when the external store has no results,
        or None when the pgvector (DB-only) path should be used instead.
    Ndocument)levelr5   F)rI   rJ   r   include_attributesc                     g | ]	}|j         
S rE   id).0r;   s     r/   
<listcomp>z6query_external_vector_document_ids.<locals>.<listcomp>   s    333&FI333r1   )rF   r   get_vector_namespacequery)r!   r"   r#   rH   rI   rJ   r   external_vector_store	namespacevector_filterskeyvector_resultss               r/   "query_external_vector_document_idsr[      s      $  t577$	%::NHh I &(N 3, 	3 	3Cg~~&-cls#066!"0:d  7        N  	33N3333r1   document_idsc                   K   |sg S t          t          j                                      t          j        j        |k                                  t          j        j        |k                                  t          j        j        |k                                  t          j        j                            d                                        t          j        j	        
                    |                    }t          |t          j        |          }|                     |           d{V }d |                                                                D             fd|D             S )zBFetch documents by IDs, preserving input order. DB-only operation.Nc                     i | ]
}|j         |S rE   rP   rR   docs     r/   
<dictcomp>z*fetch_documents_by_ids.<locals>.<dictcomp>  s    ??????r1   c                 (    g | ]}|v |         S rE   rE   )rR   doc_id	documentss     r/   rS   z*fetch_documents_by_ids.<locals>.<listcomp>  s(    PPP&Fi<O<OIf<O<O<Or1   )r   r   r&   r'   r!   r"   r#   r(   r)   rQ   in_r   r8   r9   r:   )	r6   r!   r"   r#   r\   r   r.   r;   rd   s	           @r/   fetch_documents_by_idsrf      s*       	 	v	v-?	@	@	v'83	4	4	v'83	4	4	v)--d33	4	4	v!%%l33	4	4 	 fow77D::d########F??(8(8(<(<(>(>???IPPPPLPPPPr1   c                   K   t          t          j                                      t          j        j        |k                                  t          j        j        |k                                  t          j        j        |k                                  t          j        j                            d                                        t          j        j	        
                    d                    }|;|                    t          j        j                            |          |k              }t          |t          j        |          }|                    t          j        j                            |                                        |          }|                     |           d{V }	t!          |	                                                                          S )u1   pgvector similarity search — pure DB operation.N)r   r   r&   r'   r!   r"   r#   rH   isnotr(   r)   cosine_distancer   r*   r    r8   listr9   r:   )
r6   r!   r"   r#   rH   r   rJ   rI   r.   r;   s
             r/   _query_documents_pgvectorrk     sj      	v	v-?	@	@	v'83	4	4	v'83	4	4	v(..t44	5	5	v)--d33	4	4 	 zzO%55i@@LP
 
 fow77D==2BB9MMNNTT D ::d########F  $$&&'''r1   )r   rJ   rI   rH   rU   c                H  K   |S	 t          j        |           d{V }n7# t          $ r*}	t          dt          j        j         dz             |	d}	~	ww xY wt                      r| t          | |||||||           d{V S t          d          4 d{V }
t          |
|||||||           d{V }|D ]}|

                    |           |cddd          d{V  S # 1 d{V swxY w Y   t          |||||||           d{V }|sg S | t          | |||||           d{V S t          d          4 d{V }
t          |
|||||           d{V }|D ]}|

                    |           |cddd          d{V  S # 1 d{V swxY w Y   dS )aq  
    Query documents using semantic similarity.

    When *db* is provided the caller owns the session lifetime.  When *db* is
    ``None`` the function opens (and closes) its own short-lived session so that
    no DB connection is held during external vector-store calls.

    Args:
        db: Database session, or None to let the function manage its own
        workspace_name: Name of the workspace
        query: Search query text
        observer: Name of the observing peer
        observed: Name of the observed peer
        filters: Optional filters to apply at vector store level (supports: level, session_name)
        max_distance: Maximum cosine distance for results
        top_k: Number of results to return
        embedding: Optional pre-computed embedding for the query (avoids extra API call if possible)

    Returns:
        Sequence of matching documents
    Nz%Query exceeds maximum token limit of .zquery_documents.pgvector)r!   r"   r#   rH   rI   rJ   r   )r6   r!   r"   r#   r\   r   zquery_documents.fetch)r   embed
ValueErrorr   r   	EMBEDDINGMAX_INPUT_TOKENSrF   rk   r   expunger[   rf   )r6   r!   rU   r"   r#   r   rJ   rI   rH   e
managed_dbdocsr`   r\   s                 r/   query_documentsrv   <  s     D 	.4U;;;;;;;;II 	 	 	%7'8;;;<  	  >2	 	 	 	 	 	 	 	 	 899 	 	 	 	 	 	 	Z2	 	 	 	 	 	 	 	D  ( (""3''''	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	  <%!        L  		~+)%
 
 
 
 
 
 
 
 
 	
 122       j+)%
 
 
 
 
 
 
 
 
  	$ 	$Cs####                             s8   ! 
A%AA8C$$
C.1C.7F
FF)deduplicaterd   rw   c                V  K   g }g }g }|D ]}		 |rt          | |	|||           d{V }
|
r#|	j                            d          }t          j        j        dk    pt          j        j         }|rD|	j        r=t          j	        ||||	j
        |	j        |	j        ||	j        |	j        |	j        
  
        }n6t          j	        ||||	j
        |	j        |	j        ||	j        |	j        	  	        }|	j        rd|_        |                    |           |                    |	           |	j        r|                    ||	j        f           ># t"          $ r9}t$                              d	| d
|	j         d
| d
| d| 
           Y d}~|d}~ww xY w	 |                     |           |                                  d{V  |rd |D             }t-                      }||                     t1          t          j	                                      t          j	        j                            |                                        dt;          j                    d                     d{V  |                                  d{V  n|                    d|||          }g }|D ]?\  }	}|                    tA          |	j        |||||	j        |	j        d                     @	 |!                    ||           d{V  |                     t1          t          j	                                      t          j	        j                            |                                        dt;          j                    d                     d{V  |                                  d{V  n# tD          $ r t$          #                    d           |                     t1          t          j	                                      t          j	        j                            |                                        t          j	        j$        dz   t;          j                                         d{V  |                                  d{V  Y nt"          $ r t$          %                    d           |                     t1          t          j	                                      t          j	        j                            |                                        t          j	        j$        dz   t;          j                                         d{V  |                                  d{V  Y nw xY wn<# tL          $ r/}| '                                 d{V  tQ          d          |d}~ww xY w|S )a  
    Create multiple documents with optional duplicate detection.

    Args:
        db: Database session
        documents: List of document creation schemas
        workspace_name: Name of the workspace
        observer: Name of the observing peer
        observed: Name of the observed peer

    Returns:
        List of DocumentCreate schemas that were actually inserted (excludes
        duplicates and failures).
    r"   r#   NT)exclude_nonerA   )
r!   r"   r#   contentrM   r>   internal_metadatar5   rH   
source_ids)	r!   r"   r#   r{   rM   r>   r|   r5   r}   pendingzError adding new document to /: c                 "    g | ]\  }}|j         S rE   rP   )rR   r`   _s      r/   rS   z$create_documents.<locals>.<listcomp>  s    AAA&#qsvAAAr1   syncedr   
sync_statelast_sync_atsync_attemptsrL   r!   r"   r#   r5   rM   rQ   rH   metadataz/Vector store unavailable; leaving docs unsynced   r   r   z"Unexpected error upserting vectorsz@Failed to create documents due to integrity constraint violation))is_rejected_duplicater   
model_dumpr   rB   rC   rD   rH   r   r&   r{   rM   r>   r5   r}   r   append	Exceptionloggererroradd_allcommitr   r8   r	   r'   rQ   re   valuesr   nowrT   r   upsert_manyr   warningr   	exceptionr   rollbackr   )r6   rd   r!   r"   r#   rw   honcho_documentsaccepted_documentsdocs_with_embeddingsr`   is_duplicatemetadata_dictstore_embeddings_in_postgresnew_docrs   doc_idsrV   rW   vector_recordsrH   s                       r/   create_documentsr     s     . /179FH > >=	  %:^h& & &               L333FFM
 %*j8 6,55 )
 ,   /#1%%K)"%"3&3!$!1!m"~   !/#1%%K)"%"3&3!$!1"~   } /%."##G,,,%%c*** } F$++Wcm,DEEE 	 	 	LLnnnAQnnT\nn_gnnklnn   HHHH		_


#$$$
 iikk   P	&AA,@AAAG$=$?$?! %,jj6?++U6?-11'::;;V#+%)XZZ&'            iikk!!!!!!!! 2FF"	 	 68&:  NC"))$"v&/2@,4,4030@),& &
 
 
   %&/;;I~VVVVVVVVV**v//v155g>>??'/)-*+             ))++%%%%%%%%' & & &NN#TUUU**v//v155g>>??*0/*G!*K)-             ))++%%%%%%%%%  
& 
& 
&$$%IJJJ**v//v155g>>??*0/*G!*K)-             ))++%%%%%%%%%
&    kkmm!N
 
	 si   E	DE		
F.FFEU- %CN+ )U- +CU)U- 
CU)&U- (U))U- -
V&7*V!!V&)r5   document_idc                  K   t           j        j        |k    t           j        j        |k    t           j        j        |k    t           j        j        |k    t           j        j                            d          g}|(|                    t           j        j	        |k                t          t           j                  j        |                     t          j                              }t          t           t"                   |                     |           d{V           }|j        dk    rt)          d| d          |                                  d{V  dS )a  
    Soft-delete a document by ID.

    Sets deleted_at timestamp to mark the document as deleted. The reconciliation
    job handles vector store cleanup and hard deletion from the database.

    Args:
        db: Database session
        workspace_name: Name of the workspace
        document_id: ID of the document to delete
        observer: Name of the observing peer (for authorization)
        observed: Name of the observed peer (for authorization)
        session_name: Optional session name to verify document belongs to session

    Raises:
        ResourceNotFoundException: If document not found or doesn't match criteria
    Nr(   r   	Document zA not found or does not belong to the specified collection/session)r   r&   rQ   r!   r"   r#   r(   r)   r   r5   r	   r'   r   r   r   r   r
   r   r8   rowcountr   r   )	r6   r!   r   r"   r#   r5   
conditionsupdate_stmtr;   s	            r/   delete_documentr   h  s=     6 	k)&.8 H, H,"&&t,,J &/6,FGGG 	&v%z299TXZZ9PP  ,s#2::k+B+B%B%B%B%B%B%BCCF!'ffff
 
 	
 ))++r1   c                  K   |sg S t           j        j                            |          t           j        j        |k    t           j        j        |k    t           j        j        |k    t           j        j                            d          g}|(|	                    t           j        j
        |k                t          t           j                  j        |                     t          j                                                  t           j        j        t           j        j                  }|                     |           d{V }|                                }	|                                  d{V  d |	D             S )uz  
    Soft-delete multiple documents in a single UPDATE ... RETURNING statement.

    Returns (id, level) tuples for rows that actually got deleted — i.e. rows
    that matched the workspace/observer/observed filter and were not already
    soft-deleted. IDs that didn't match are silently skipped; callers can diff
    the returned ids against the input to detect misses.
    Nr   c                 *    g | ]}|j         |j        fS rE   )rQ   rM   )rR   rows     r/   rS   z$delete_documents.<locals>.<listcomp>  s!    000CSVSY000r1   )r   r&   rQ   re   r!   r"   r#   r(   r)   r   r5   r	   r'   r   r   r   	returningrM   r8   r:   r   )
r6   r!   r\   r"   r#   r5   r   r.   r;   rowss
             r/   delete_documentsr     sK     "  	 	|,,&.8 H, H,"&&t,,J &/6,FGGG	v	
		48::	&	&	6?%v'<	=	=	 	 ::d########F::<<D
))++0040000r1   c                 &  K   t          t          j                                      t          j        j        |k    t          j        j        |k    t          j        j                            d                                        t          j
                              }t          t          t                   |                     |           d{V           }|j        dk    rt!          d| d|           |                                  d{V  dS )a  
    Soft-delete a document by ID and workspace.

    Sets deleted_at timestamp to mark the document as deleted. The reconciliation
    job handles vector store cleanup and hard deletion from the database.

    Args:
        db: Database session
        workspace_name: Name of the workspace
        document_id: ID of the document to delete

    Raises:
        ResourceNotFoundException: If document not found or doesn't belong to the workspace
    Nr   r   r   z+ not found or does not belong to workspace )r	   r   r&   r'   rQ   r!   r(   r)   r   r   r   r   r
   r   r8   r   r   r   )r6   r!   r   r   r;   s        r/   delete_document_by_idr     s      ( 	v	O+-O*n<O&**400

 


 
48::	&	&  ,s#2::k+B+B%B%B%B%B%B%BCCF!'```P^``
 
 	
 ))++r1   observationsc                 
  K   |sg S t                      }t                      }t                      }|D ]x}|j        |                    |j                   |                    |j                   |                    |j                   |                    |j        |j        f           y|D ]}t          | ||           d{V  |D ],}t          | |t          j        |                     d{V  -|D ]\  }	}
t          | ||	|
           d{V  d |D             }	 t          j        |           d{V }n/# t          $ r"}t          t          |                    |d}~ww xY wg }i }t          j        j        dk    pt          j        j         }t'          ||d          D ]\  }}|r2t)          j        ||j        |j        |j        dd	i |j        |
	  	        }n0t)          j        ||j        |j        |j        dd	i |j                  }d|_        |                    |           |j        |j        f}||vrg ||<   ||                             ||f           	 |                     |           |                                  d{V  |D ]}|                     |           d{V  t9                      }d |D             }||                     t=          t(          j                                      t(          j        j         !                    |                    "                    dtG          j$                    d                     d{V  |                                  d{V  n!|%                                D ]\  \  }	}
}|&                    d||	|
          }g }g }|D ]Y\  }}|                    |j                    |                    tO          |j         |||	|
|j(        |j)        d                     Z	 |*                    ||           d{V  |                     t=          t(          j                                      t(          j        j         !                    |                    "                    dtG          j$                    d                     d{V  |                                  d{V  H# tV          $ r tX          -                    d|           |                     t=          t(          j                                      t(          j        j         !                    |                    "                    t(          j        j.        d	z   tG          j$                                         d{V  |                                  d{V  Y )t^          $ r tX          0                    d|           |                     t=          t(          j                                      t(          j        j         !                    |                    "                    t(          j        j.        d	z   tG          j$                                         d{V  |                                  d{V  Y 	w xY wn<# tb          $ r/}| 2                                 d{V  t          d          |d}~ww xY wtX          3                    dti          |          |           |S )a3  
    Create multiple observations (documents) from user input.

    This function validates all referenced resources, generates embeddings
    in batch, and creates the documents.

    Args:
        db: Database session
        observations: List of observation creation schemas
        workspace_name: Name of the workspace

    Returns:
        List of created Document objects

    Raises:
        ResourceNotFoundException: If any session or peer is not found
        ValidationException: If embedding generation fails or integrity constraint is violated
    N)namery   c                     g | ]	}|j         
S rE   )r{   )rR   obss     r/   rS   z'create_observations.<locals>.<listcomp>  s    444444r1   rA   T)strictexplicitr   )	r!   r"   r#   r{   rM   r>   r|   r5   rH   )r!   r"   r#   r{   rM   r>   r|   r5   r~   c                     g | ]	}|j         
S rE   rP   r_   s     r/   rS   z'create_observations.<locals>.<listcomp>[  s    :::#sv:::r1   r   r   r   rL   r   r   zHVector store unavailable for namespace %s; leaving observations unsyncedr   z)Unexpected error upserting vectors for %szCFailed to create observations due to integrity constraint violationz'Created %d observations in workspace %s)5set
session_idaddobserver_idobserved_idr   r   r   
PeerCreater   r   simple_batch_embedro   r   strr   rB   rC   rD   zipr   r&   r{   r   r   r   r   refreshr   r8   r	   r'   rQ   re   r   r   r   itemsrT   r   r5   rM   r   r   r   r   r   r   r   r   r   debuglen)r6   r   r!   sessions_to_validatepeers_to_validatecollection_pairsr   r5   	peer_namer"   r#   contents
embeddingsrs   r   collection_embeddingsr   rH   r`   collection_keyrV   all_doc_idsr   rW   r   r   s                             r/   create_observationsr     s     .  	 &)UU"%%%-0UU A A>% $$S^444co...co...cos?@@@@ - < <"lN;;;;;;;;;; ' O O	r>7+=9+M+M+MNNNNNNNNNN / 
 
(&H
 
 
 	
 	
 	
 	
 	
 	
 	
 	

 54|444H1+>xHHHHHHHH

 1 1 1!#a&&))q01 /1 	  	"j0V8M8V4V ! lJtDDD G GY' 	/- "$ ^#
 
 
CC /- "$ ^	 	 	C #$$$ /3?;!66646!.1n-44c95EFFFFg


#$$$iikk# 	" 	"C**S//!!!!!!!! !: ; ;::)9::: !(**v''v)--k::;;'!%"#            ))++ (='B'B'D'DH& H& $ #1FF"	 	 68%'&:  NCNN36***"))$"v&/2@,4,4030@),& &
 
 
   )&/;;I~VVVVVVVVV**v//v155g>>??'/)-*+             ))++%%%%%%%%' & & &NNb!   **v//v155g>>??*0/*G!*K)-             ))++%%%%%%%%%  & & &$$CY   **v//v155g>>??*0/*G!*K)-             ))++%%%%%%%%%&    kkmm!Q
 
	 LL1  
 sd   %E   
E,
E''E,'F*Z CSZ CZ6Z 9CZZ ZZ 
[(*[[r`   c          
      &  K   t          | ||j        ||dd|j                   d{V }|sdS |d         }t          t          j                            |j                            }t          t          j                            |j                            }t          ||z
            }	t          ||z
            }
t          |          |	dz  z   }t          |          |
dz  z   }||k    rzt          	                    d|j         d	|j         d
           t          j
                            t          j        j                  |_        |                                  d{V  dS t          	                    d|j         d	|j         d
           dS )a  
    Check if a document is a duplicate of an existing document.

    Uses: 1) Cosine similarity (>=0.95), 2) Token diff for retention.

    Returns True if both:
    - the document is deemed a duplicate of an existing document
    - the existing document is deemed a superior duplicate

    If the document is not a duplicate, returns False.

    If the document is a duplicate AND the new document is superior,
    deletes the existing document and returns False.
    g?r   )r6   r!   rU   r"   r#   rJ   rI   rH   NFr   r4   z>[DUPLICATE DETECTION] Deleting existing in favor of new. new='z', existing='z'.z?[DUPLICATE DETECTION] Rejecting new in favor of existing. new='T)rv   r{   rH   r   r   encodingencoder   r   r   datetimer   timezoneutcr(   flush)r6   r`   r!   r"   r#   similar_docsexisting_doc
tokens_newtokens_existing
unique_newunique_existing	score_newscore_existings                r/   r   r     s     . )%k-	 	 	 	 	 	 	 	 	L  u?L %.55ckBBCCJ*3::<;OPPQQOZ/122J/J677OJ:?3I))_r-ABN N""S[gsg{	
 	
 	
 #+"3"7"78I8M"N"Nhhjju NN|#+||dpdx|||   4r1   d   rV   
batch_sizeolder_than_minutesc                   K   t           j                             t           j        j                  t          j        |          z
  }t          t          j                                      t          j        j	        
                    d                                        t          j        j	        |k                                   |                              d          }|                     |           d{V }t          |                                                                          }|sdS i }|D ]W}	|                    d|	j        |	j        |	j                  }
|                    |
g                               |	j                   Xt1                      }|                                D ]n\  }
}	 |                    |
|           d{V  |                    |           8# t8          $ r*}t:                              d|
 d|            Y d}~gd}~ww xY w|r|                     t?          t          j                                      t          j        j                             |                               d{V  | !                                 d{V  t:          "                    d	tG          |           d
           tG          |          S | $                                 d{V  dS )a  
    Cleanup soft-deleted documents by removing their vectors and database records.

    This function implements a two-phase cleanup process for documents that have been
    soft-deleted (deleted_at is not NULL)

    Args:
        db: Database session for executing queries
        external_vector_store: External vector store instance for deleting vectors
        batch_size: Maximum number of documents to process per call (default 100)
        older_than_minutes: Only process documents soft-deleted more than this many
            minutes ago (default 5).

    Returns:
        Count of documents cleaned up (only those where vector deletion succeeded).
    )minutesNT)skip_lockedr   rL   zFailed to delete vectors from r   zCleaned up z soft-deleted documents)%r   r   r   r   	timedeltar   r   r&   r'   r(   is_notr    with_for_updater8   rj   r9   r:   rT   r!   r"   r#   
setdefaultr   rQ   r   r   delete_manyr	   r   r   r   r   re   r   r   r   r   )r6   rV   r   r   cutoffr.   r;   rd   by_namespacer`   rW   successfully_deleted_idsidsrs   s                 r/   cleanup_soft_deleted_documentsr     s%     , ""8#4#899H<N"= = = F 	v	v)0066	7	7	v)F2	3	3	z			T	*	* 	 ::d########FV^^%%))++,,I q *,L > >)>>LL	
 
	 		2..55cf==== *-&,,.. N N	3	N'33IsCCCCCCCCC$++C0000 	N 	N 	NNNLILLLLMMMMMMMM	N
   
-jj6?##))"&&'?@@ 
 
 	
 	
 	
 	
 	
 	
 	

 iikkP#677PPP	
 	
 	
 +,,, ++--1s   81G**
H4 HHc                   K   |sg S t          t          j                                      t          j        j        |k    t          j        j                            |          t          j        j                            d                    }| 	                    |           d{V }|
                                                                S )a"  
    Get multiple documents by their IDs.

    Args:
        db: Database session
        workspace_name: Workspace identifier
        document_ids: List of document IDs to retrieve

    Returns:
        Sequence of documents found (may be fewer than requested if some IDs don't exist)
    N)r   r   r&   r'   r!   rQ   re   r(   r)   r8   r9   r:   )r6   r!   r\   r.   r;   s        r/   get_documents_by_idsr   ]  s         	&/""((&.8|,,"&&t,, D
 ::d########F>>!!!r1   ry   	parent_idc                <  K   t          t          j                                      t          j        j        |k    t          j        j                            |g          t          j        j                            d                    }|r(|                    t          j        j	        |k              }|r(|                    t          j        j
        |k              }|                     |           d{V }|                                                                S )a  
    Get all observations that have this document as a source/premise.

    Useful for traversing the reasoning tree upward (source -> derived observations).
    Uses GIN index on source_ids for efficient lookups.

    Args:
        db: Database session
        workspace_name: Workspace identifier
        parent_id: Document ID to find children of
        observer: Optional filter by observer
        observed: Optional filter by observed

    Returns:
        Sequence of documents that reference this document as a source
    N)r   r   r&   r'   r!   r}   containsr(   r)   r"   r#   r8   r9   r:   )r6   r!   r   r"   r#   r.   r;   s          r/   get_child_observationsr   x  s      2 &/""((&.8"++YK88"&&t,, D
  @zz&/2h>?? @zz&/2h>??::d########F>>!!!r1   )rG   NN)N)r   rG   )Mr   collections.abcr   loggingr   typingr   r   
sqlalchemyr   r   r	   sqlalchemy.enginer
   sqlalchemy.excr   sqlalchemy.ext.asyncior   sqlalchemy.sqlr   sqlalchemy.sql.functionsr   srcr   r   
src.configr   src.crud.collectionr   src.crud.peerr   src.crud.sessionr   src.dependenciesr   src.embedding_clientr   src.exceptionsr   r   r   src.utils.filterr   src.vector_storer   r   r   __name__r   r   dictboolinttupler&   r0   r3   r<   r?   rF   rj   floatr[   rf   rk   rv   DocumentCreater   r   r   r   ConclusionCreater   r   r   r   r   rE   r1   r/   <module>r     se	    $ $ $ $ $ $               - - - - - - - - - - * * * * * * ) ) ) ) ) ) / / / / / / ! ! ! ! ! ! ) ) ) ) ) )               8 8 8 8 8 8 " " " " " " ( ( ( ( ( ( ' ' ' ' ' ' 1 1 1 1 1 1         
 * ) ) ) ) )          
8		 &*- - -- - 	-
 #s(^d"- - :- E&/"#- - - -f &*	# # ## #s(^d"# 	#
 E&/"## # # #X #$" $" $"$"$" 	$"
 $" $" *$" fo$" $" $" $"Z "" "" """""" 	""
 "" "" fo"" "" "" ""J     !%%)/4 /4/4/4 /4 E{	/4
 /4 $,/4 #s(^d"/4 
#Y/4 /4 /4 /4p &*Q QQQ Q 	Q
 s)Q #s(^d"Q 
&/Q Q Q Q8((( ( 	(
 E{( #s(^d"( $,( ( 
&/( ( ( (R &*!%$(i i itii i
 i i #s(^d"i $,i i E{T!i foi i i if } } }}G*+} }
 } } } 
'
 !} } } }N  $. . ... .
 . . *. 
. . . .p  $'1 '1 '1'1'1 3-'1
 '1 '1 *'1 
%S/'1 '1 '1 '1T### # 
	# # # #LWW734W W 
&/	W W W Wt??		? ?
 ? ? 
? ? ? ?J 	P PP&P P 	P
 	P P P Pp""" s)" fo	" " " "@  $" $" $"$"$" $"
 Dj$" Dj$" fo$" $" $" $" $" $"r1   