
    yj)                    <   U d Z ddlmZ ddlZddlmZmZ ddlmZ ddl	m
Z
 ddlmZmZmZmZmZmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ  ej        e          ZdZde d<   dZ!dZ"dZ# G d de          Z$ddd0dZ%d1dZ&d1d Z'd2d$Z(d3d%Z)d4d(Z*d5d*Z+d6d/Z,dS )7u  Startup validator for the embedding pipeline.

Crashes the process at boot if the configured EMBEDDING_VECTOR_DIMENSIONS does
not match the physical pgvector schema. Replaces an earlier config-time guard
that forbade non-1536 dims unless the operator asserted a VECTOR_STORE.MIGRATED
flag — the schema introspection here is more accurate because it inspects
actual state instead of operator-asserted state.

For external stores (turbopuffer, lancedb) the check is best-effort: namespaces
are per-workspace and lazy-created, so this validator can only sample existing
ones. Full enumeration is available via `uv run python scripts/configure_embeddings.py --report`.
    )annotationsN)selecttext)SQLAlchemyError)AsyncEngine)AsyncRetrying
RetryErrorbefore_sleep_logretry_if_exception_typestop_after_attempt
wait_fixed)AppSettingssettings)HonchoException)
Collection	Workspace)VectorStore)	documentsmessage_embeddingsztuple[str, ...]_EMBEDDING_TABLES   g      ?
   c                      e Zd ZdZdS )StartupValidationErroru  Raised when the embedding configuration cannot be reconciled with the
    physical schema. Always surfaced before any HTTP route is served or any
    queue task is processed.

    Inherits from ``HonchoException`` (status_code=500) so the project's
    exception handlers recognize it consistently. Startup-time failure, not
    a per-request validation error — ``ValidationException``'s 422 semantics
    would be misleading.
    N)__name__
__module____qualname____doc__     G/DATA/AppData/hermes/projects/honcho/src/startup/embedding_validator.pyr   r   1   s           r    r   )app_settingsenginer   r"   AppSettings | NonereturnNonec                  K   ||nt           }|j        j        }|j        j        }t          | |           d{V }t          |||           |j        j        dv rt          | |           d{V  dS dS )a+  Validate that the embedding schema matches the configured dimension.

    Run after the DB pool is initialized and before the embedding client is
    constructed. Fails closed: any unrecoverable introspection error raises
    rather than letting the process serve traffic with an unknown state.
    N)schema
target_dim)turbopufferlancedb)r)   )
r   	EMBEDDINGVECTOR_DIMENSIONSDBSCHEMA$_introspect_pgvector_dims_with_retry_assert_pgvector_dims_matchVECTOR_STORETYPE_sample_external_namespaces)r#   r"   sr)   r(   dimss         r!   validate_embedding_schemar7   =   s       %0hA.JT[F5ffEEEEEEEEDV
KKKK~888)&ZHHHHHHHHHHHH 98r    r(   strdict[str, int]c           	       K   	 t          t          t                    t          t                    t          t                    t          t          t          j
                  d          2 3 d{V }|5  t          | |           d{V cddd           c S # 1 swxY w Y   :6 n># t          $ r1}|j                                        }t          d|           |d}~ww xY wt          d          )u   Schema-qualified pg_attribute introspection with bounded retries.

    Returns a mapping of table name -> raw ``atttypmod`` for the embedding
    columns. Fails closed on the last attempt — uncertainty is not a green
    light to serve traffic.
    F)stopwaitretrybefore_sleepreraiseNz%could not validate embedding schema: z*embedding schema introspection did not run)r   r   _RETRY_ATTEMPTSr   _RETRY_BACKOFF_SECONDSr   r   r
   loggerloggingWARNING_introspect_pgvector_dims_oncer	   last_attempt	exceptionr   )r#   r(   attempte
underlyings        r!   r0   r0   S   s     *#O44233)/::)&'/BB
 
 
 	L 	L 	L 	L 	L 	L 	L'  L L;FFKKKKKKKKL L L L L L L L L L L L L L L L L L L
 
    ^--//
$@J@@
 
	 !!M
N
NNsH   A'B' +B%1B' 4B
B' B	B'  B	!B' '
C"1,CC"c                  K   t          d          }|                                 4 d{V }|                    ||t          t                    d           d{V }d |D             cddd          d{V  S # 1 d{V swxY w Y   dS )zSingle-shot schema-qualified pg_attribute lookup.

    The join through ``pg_class``/``pg_namespace`` lets us respect
    ``DB.SCHEMA`` rather than relying on the ambient search_path.
    a6  
        SELECT c.relname AS table_name, a.atttypmod AS typmod
        FROM pg_attribute a
        JOIN pg_class c ON a.attrelid = c.oid
        JOIN pg_namespace n ON c.relnamespace = n.oid
        WHERE n.nspname = :schema
          AND c.relname = ANY(:tables)
          AND a.attname = 'embedding'
        N)r(   tablesc                (    i | ]}|j         |j        S r   )
table_nametypmod.0rows     r!   
<dictcomp>z2_introspect_pgvector_dims_once.<locals>.<dictcomp>   s    ===s
===r    )r   connectexecutelistr   )r#   r(   queryconnresults        r!   rE   rE   o   sD      	
 
E ~~ > > > > > > >4||.?)@)@AA
 
 
 
 
 
 
 
 >=f===> > > > > > > > > > > > > > > > > > > > > > > > > > > > > >s   =A<<
B	Br6   r)   intc          	        t          t                    }||                                 z
  }|rDd                    t	          fd|D                                 }t          d| ddz             t	          |          D ]X}| |         }|dk    rt           d| ddz   d	z             |}||k    r%t           d| d
| dd| dz   dz   dz             Yd S )N, c              3  (   K   | ]} d | dV  dS ).z
.embeddingNr   )rQ   tr(   s     r!   	<genexpr>z._assert_pgvector_dims_match.<locals>.<genexpr>   s4      "N"Nf#<#<q#<#<#<"N"N"N"N"N"Nr    z!Required vector columns missing: r^   z" Run `alembic upgrade head` first.z+.embedding has no declared vector dimensionz (unbounded typmod). Runz1 `uv run python scripts/configure_embeddings.py`.z.embedding dim (z) does not matchz EMBEDDING_VECTOR_DIMENSIONS (z). Runz0 `uv run python scripts/configure_embeddings.py`z$ or fix EMBEDDING_VECTOR_DIMENSIONS.)setr   keysjoinsortedr   )	r6   r(   r)   expectedmissinglistingtable	atttypmodactuals	    `       r!   r1   r1      s[    $%%H$G 
))F"N"N"N"Ng"N"N"NNNOO$::::23
 
 	
 !!  K	??(NNENNN,-EF   Z(KKEKK6KKKE:EEEFDE 99     r    c          	       K   t          | t                     d{V }t          | t                     d{V }|s|st                              d           dS ddlm}  |            }|dS g }|D ]+}|                    |                    d|                     ,|D ]2\  }}}	|                    |                    d|||	                     3g }
|D ]7}t          ||           d{V }|||k    r|
                    ||f           8|
r<d
                    d	 |
D                       }t          d
| dd| dz   dz             dS )u  Best-effort dim check across existing external-store namespaces.

    External stores in this codebase are per-workspace and lazy-created on
    first write (see ``src.vector_store.get_vector_namespace``), so there is
    no canonical deployment-wide namespace to introspect. We enumerate up to
    ``_EXTERNAL_SAMPLE_LIMIT`` of each namespace category from the application
    DB and probe each:

    - Message namespaces — one per workspace.
    - Document namespaces — one per existing ``(workspace, observer, observed)``
      collection triple.

    Missing namespaces are OK; mismatched dims crash startup. Run
    ``configure_embeddings --report`` for full enumeration when a hard
    guarantee is needed.
    NzQExternal-store validator: no workspaces or collections exist yet, skipping sampler   )get_external_vector_storemessagedocument)observerobservedr\   c              3  ,   K   | ]\  }}| d | dV  dS )z (dim=)Nr   )rQ   nsds      r!   r`   z._sample_external_namespaces.<locals>.<genexpr>   s7      HHEB//1///HHHHHHr    z/Existing external-store namespaces have dim != : z. Runz: `uv run python scripts/configure_embeddings.py --report`.)_sample_workspace_names_EXTERNAL_SAMPLE_LIMIT_sample_collection_keysrB   infosrc.vector_storerm   appendget_vector_namespace_probe_namespace_dimrd   r   )r#   r)   workspace_namescollection_keysrm   store
candidatesworkspace_namerp   rq   
mismatches	namespace
actual_dim	formatteds                 r!   r4   r4      s     " 4F<RSSSSSSSSO3F<RSSSSSSSSO ? !	
 	
 	
 	 ;:::::%%''E} 	J) Q Q%44YOOPPPP.= 
 
*(&&!!	 '  	
 	
 	
 	
 )+J 7 7	/yAAAAAAAA
!jJ&>&>y*5666 
IIHHZHHHHH	$KjKKK")"""#JK
 
 	

 
r    limit	list[str]c                  K   t          t          j                                      t          j                                                                      |          }|                                 4 d{V }|                    |           d{V }d |D             cddd          d{V  S # 1 d{V swxY w Y   dS )u-  Pull up to ``limit`` workspace names ordered by creation time.

    Uses the ORM ``Workspace`` model so ``Base.metadata.schema`` (configured
    from ``settings.DB.SCHEMA`` in ``src/db.py``) is honored automatically —
    a non-public schema deployment must not silently sample the wrong table.
    Nc                    g | ]
}|d          S )r   r   rP   s     r!   
<listcomp>z+_sample_workspace_names.<locals>.<listcomp>   s    )))3A)))r    )	r   r   nameorder_by
created_atdescr   rT   rU   r#   r   stmtrX   rY   s        r!   rx   rx      sA      ).!!**9+?+D+D+F+FGGMMeTTD~~ * * * * * * *4||D))))))))))&)))* * * * * * * * * * * * * * * * * * * * * * * * * * * * * *s   8'B22
B<?B<list[tuple[str, str, str]]c                  K   t          t          j        t          j        t          j                                      t          j                                                                      |          }| 	                                4 d{V }|
                    |           d{V }d |D             cddd          d{V  S # 1 d{V swxY w Y   dS )zPull up to ``limit`` ``(workspace_name, observer, observed)`` triples,
    one per existing collection row. Each triple corresponds to a document
    namespace that may exist in the external store.Nc                <    g | ]}|d          |d         |d         fS )r         r   rP   s     r!   r   z+_sample_collection_keys.<locals>.<listcomp>  s+    ;;;SQQQ(;;;r    )r   r   r   rp   rq   r   r   r   r   rT   rU   r   s        r!   rz   rz      sM      	z(**=z?RSS	*',,..	/	/	u 	
 ~~ < < < < < < <4||D))))))));;F;;;< < < < < < < < < < < < < < < < < < < < < < < < < < < < < <s   'C
CCr   r   r   
int | Nonec                <   K   |                      |           d{V S )a  Return the namespace's declared dim, or ``None`` if not present.

    Delegates to the store's own ``probe_namespace_dim`` implementation
    (lancedb opens the table, turbopuffer reads the schema). ``None`` means
    "lazy-create namespace, nothing to validate against."
    N)probe_namespace_dim)r   r   s     r!   r   r     s.       **9555555555r    )r#   r   r"   r$   r%   r&   )r#   r   r(   r8   r%   r9   )r6   r9   r(   r8   r)   rZ   r%   r&   )r#   r   r)   rZ   r%   r&   )r#   r   r   rZ   r%   r   )r#   r   r   rZ   r%   r   )r   r   r   r8   r%   r   )-r   
__future__r   rC   
sqlalchemyr   r   sqlalchemy.excr   sqlalchemy.ext.asyncior   tenacityr   r	   r
   r   r   r   
src.configr   r   src.exceptionsr   
src.modelsr   r   r|   r   	getLoggerr   rB   r   __annotations__r@   rA   ry   r   r7   r0   rE   r1   r4   rx   rz   r   r   r    r!   <module>r      s=     # " " " " "  # # # # # # # # * * * * * * . . . . . .                - , , , , , , , * * * * * * , , , , , , , , ( ( ( ( ( (		8	$	$ &I  H H H H    	 	 	 	 	_ 	 	 	 (,I I I I I I,O O O O8> > > >6   <<
 <
 <
 <
~
* 
* 
* 
*< < < < 6 6 6 6 6 6r    