"""
Turbopuffer vector store implementation.

This module provides a Turbopuffer-based implementation of the VectorStore interface.
"""

import logging
import re
from collections.abc import Sequence
from typing import Any, Literal, cast

from turbopuffer import AsyncTurbopuffer, InternalServerError, NotFoundError
from turbopuffer.lib.namespace import AsyncNamespace
from turbopuffer.types import Filter, RowParam

from src.config import settings
from src.exceptions import VectorStoreError

from . import VectorQueryResult, VectorRecord, VectorStore

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Type aliases for Turbopuffer's filter formats
# Equality filter: (attribute, "Eq", value).
EqFilter = tuple[str, Literal["Eq"], Any]
# Membership filter: (attribute, "In", [values]).
InFilter = tuple[str, Literal["In"], Sequence[Any]]
# Conjunction of several filters: ("And", [filters]).
AndFilter = tuple[Literal["And"], Sequence[Filter]]

# Distance metric name handed to Turbopuffer on both writes and queries,
# so stored vectors and query vectors are always compared the same way.
DISTANCE_METRIC = "cosine_distance"


class TurbopufferVectorStore(VectorStore):
    """
    Turbopuffer implementation of the VectorStore interface.

    Namespaces are generated via get_vector_namespace() which hashes
    the variable components to fit Turbopuffer's [A-Za-z0-9-_.]{1,128} limit.
    """

    tpuf: AsyncTurbopuffer

    def __init__(self):
        """
        Build the store and its async Turbopuffer client from application settings.

        Raises:
            ValueError: If no Turbopuffer API key is configured.
        """
        super().__init__()

        store_settings = settings.VECTOR_STORE

        # Refuse to start without credentials rather than failing on first request.
        token = store_settings.TURBOPUFFER_API_KEY
        if not token:
            raise ValueError(
                "VECTOR_STORE_TURBOPUFFER_API_KEY must be set for Turbopuffer vector store"
            )

        # The region is taken from settings; the hard-coded default applies
        # whenever the configured value is empty or unset.
        self.tpuf = AsyncTurbopuffer(
            api_key=token,
            region=store_settings.TURBOPUFFER_REGION or "gcp-us-east4",
        )

    def _get_namespace(self, namespace: str) -> AsyncNamespace:
        """Return the client's handle for the Turbopuffer namespace ``namespace``."""
        client = self.tpuf
        return client.namespace(namespace)

    async def upsert_many(
        self,
        namespace: str,
        vectors: list[VectorRecord],
    ) -> None:
        """
        Upsert multiple vectors into Turbopuffer with a single write call.

        Each record becomes one row made of its metadata plus the required
        ``id`` and ``vector`` fields.

        Args:
            namespace: The namespace to store the vectors in
            vectors: List of VectorRecord objects to upsert; an empty list is a no-op

        Raises:
            VectorStoreError: If the SDK raises InternalServerError for the write.
            Exception: Any other failure is logged with its traceback and
                re-raised unchanged.
        """
        if not vectors:
            return

        ns = self._get_namespace(namespace)

        # The dict literal carries arbitrary metadata fields, which RowParam supports
        # via extra_items=object. basedpyright can't see through the spread, so cast
        # via object per its reportInvalidCast guidance.
        # Spread metadata first so a caller-supplied "id" or "vector" key
        # can never clobber the required upsert fields.
        rows: list[RowParam] = [
            cast(
                RowParam,
                cast(
                    object,
                    {
                        **(v.metadata or {}),
                        "id": v.id,
                        "vector": v.embedding,
                    },
                ),
            )
            for v in vectors
        ]

        try:
            await ns.write(
                upsert_rows=cast(Any, rows),
                distance_metric=DISTANCE_METRIC,
            )
        except InternalServerError as exc:
            # Turbopuffer unavailable. SDK implicitly retries 5xx responses,
            # so raise a vector store error and let callers leave writes unsynced.
            logger.warning(
                "Turbopuffer unavailable for upsert to namespace %s (%s after retries)",
                namespace,
                exc.status_code,
            )
            raise VectorStoreError(
                f"Turbopuffer unavailable for upsert to namespace {namespace}"
            ) from exc
        except Exception:
            # Lazy %-style arguments, matching the warning above; the rendered
            # message is unchanged.
            logger.exception(
                "Failed to upsert %d vectors to namespace %s",
                len(vectors),
                namespace,
            )
            raise

    async def query(
        self,
        namespace: str,
        embedding: list[float],
        *,
        top_k: int = 10,
        filters: dict[str, Any] | None = None,
        max_distance: float | None = None,
        include_attributes: bool | list[str] = True,
    ) -> list[VectorQueryResult]:
        """
        Query for similar vectors in Turbopuffer.

        Args:
            namespace: The namespace to query
            embedding: The query embedding vector
            top_k: Maximum number of rows requested from Turbopuffer
            filters: Optional metadata filters (see ``_build_filters`` for the
                accepted shapes)
            max_distance: Optional maximum distance threshold. It is applied
                here, after Turbopuffer has returned its ``top_k`` rows, so the
                result may contain fewer than ``top_k`` entries.
            include_attributes: Attributes to include in the response. Passing False
                avoids parsing unused row attributes.

        Returns:
            List of VectorQueryResult objects in the order Turbopuffer returned
            them. ``score`` holds the row's distance (``$dist``); rows without a
            ``$dist`` value are given 0.0. An empty list is returned when the
            namespace does not exist or when the SDK raises InternalServerError.

        Raises:
            Exception: Any failure other than NotFoundError / InternalServerError
                is logged with its traceback and re-raised unchanged.
        """
        ns = self._get_namespace(namespace)

        try:
            # Build filter conditions for Turbopuffer
            filter_condition = self._build_filters(filters) if filters else None

            # Query using rank_by for vector similarity
            # rank_by must be a tuple: (attribute, "ANN", vector)
            rank_by: tuple[str, Literal["ANN"], Sequence[float]] = (
                "vector",
                "ANN",
                embedding,
            )

            # Only pass filters if we have them (avoid passing None)
            query_kwargs: dict[str, Any] = {
                "rank_by": rank_by,
                "top_k": top_k,
                "distance_metric": DISTANCE_METRIC,
                "include_attributes": include_attributes,
            }
            if filter_condition is not None:
                query_kwargs["filters"] = filter_condition

            response = await ns.query(**query_kwargs)

            query_results: list[VectorQueryResult] = []
            for row in response.rows or []:
                # Distance is accessed via row["$dist"]
                dist: float = float(row["$dist"]) if "$dist" in row else 0.0
                # Filter by max_distance if specified
                if max_distance is not None and dist > max_distance:
                    continue

                # Metadata is whatever extra attributes the row carries, minus
                # "$"-prefixed internal fields such as $dist.
                row_metadata: dict[str, Any] = {
                    k: v
                    for k, v in (row.model_extra or {}).items()
                    if not k.startswith("$")
                }

                query_results.append(
                    VectorQueryResult(
                        id=str(row.id),
                        score=dist,
                        metadata=row_metadata,
                    )
                )

            # Lazy %-style arguments: nothing is formatted unless DEBUG is enabled.
            logger.debug(
                "Query returned %d results from namespace %s",
                len(query_results),
                namespace,
            )
            return query_results

        except NotFoundError:
            # Namespace doesn't exist yet - no vectors have been written.
            logger.debug(
                "Namespace %s does not exist, returning empty results",
                namespace,
            )
            return []

        except InternalServerError as exc:
            # Turbopuffer unavailable. SDK implicitly retries 5xx responses,
            # so we should return [].
            logger.warning(
                "Turbopuffer unavailable for query on namespace %s (%s after retries), returning empty results",
                namespace,
                exc.status_code,
            )
            return []

        except Exception:
            logger.exception("Failed to query namespace %s", namespace)
            raise

    def _build_filters(self, filters: dict[str, Any]) -> Filter | None:
        """
        Translate a plain filter mapping into Turbopuffer's tuple-based filters.

        Each entry of ``filters`` becomes one clause:

        - ``{"key": {"in": ["a", "b"]}}`` -> ``("key", "In", ["a", "b"])``
          (any dict value that has an ``"in"`` key is a membership test)
        - ``{"key": "value"}`` -> ``("key", "Eq", "value")`` (everything else)

        Args:
            filters: Dictionary of attribute -> value filters

        Returns:
            ``None`` for an empty mapping, the bare clause when there is exactly
            one, otherwise ``("And", [clauses])`` in the mapping's iteration order.
        """
        if not filters:
            return None

        clauses: list[EqFilter | InFilter] = [
            (attribute, "In", cast(Sequence[Any], spec["in"]))
            if isinstance(spec, dict) and "in" in spec
            else (attribute, "Eq", cast(Any, spec))
            for attribute, spec in filters.items()
        ]

        # Several clauses must all hold, so wrap them in a conjunction.
        if len(clauses) > 1:
            combined: AndFilter = ("And", clauses)
            return combined

        # A lone clause is returned unwrapped; no clauses at all means no filter.
        return clauses[0] if clauses else None

    async def delete_many(self, namespace: str, ids: list[str]) -> None:
        """
        Delete multiple vectors from Turbopuffer with a single write call.

        A namespace that does not exist is treated as "nothing to delete" and
        is not an error.

        Args:
            namespace: The namespace containing the vectors
            ids: List of vector identifiers to delete; an empty list is a no-op

        Raises:
            VectorStoreError: If the SDK raises InternalServerError for the write.
            Exception: Any other failure is logged with its traceback and
                re-raised unchanged.
        """
        if not ids:
            return

        ns = self._get_namespace(namespace)

        try:
            await ns.write(deletes=ids)
        except NotFoundError:
            # Namespace doesn't exist - nothing to delete
            logger.debug("Namespace %s does not exist, nothing to delete", namespace)
        except InternalServerError as exc:
            logger.warning(
                "Turbopuffer unavailable for delete from namespace %s (%s after retries)",
                namespace,
                exc.status_code,
            )
            raise VectorStoreError(
                f"Turbopuffer unavailable while deleting vectors in namespace {namespace}"
            ) from exc
        except Exception:
            # Lazy %-style arguments, matching the warning above; the rendered
            # message is unchanged.
            logger.exception(
                "Failed to delete %d vectors from namespace %s",
                len(ids),
                namespace,
            )
            raise

    async def delete_namespace(self, namespace: str) -> None:
        """
        Delete an entire namespace and all its vectors from Turbopuffer.

        A namespace that does not exist is treated as "nothing to delete" and
        is not an error.

        Args:
            namespace: The namespace to delete

        Raises:
            Exception: Any failure other than NotFoundError is logged with its
                traceback and re-raised unchanged.
        """
        ns = self._get_namespace(namespace)

        try:
            await ns.delete_all()
            # Lazy %-style arguments: nothing is formatted unless DEBUG is enabled.
            logger.debug("Deleted all vectors from namespace %s", namespace)
        except NotFoundError:
            # Namespace doesn't exist - nothing to delete
            logger.debug("Namespace %s does not exist, nothing to delete", namespace)
        except Exception:
            # NOTE(review): unlike delete_many, a server error here is not wrapped
            # in VectorStoreError; it lands in this handler and is re-raised as-is.
            # Confirm that callers expect the SDK exception for this operation.
            logger.exception("Failed to delete namespace %s", namespace)
            raise

    async def close(self) -> None:
        """Shut down the underlying Turbopuffer client, then log that it is closed."""
        client = self.tpuf
        await client.close()
        logger.debug("Turbopuffer client closed")

    async def probe_namespace_dim(self, namespace: str) -> int | None:
        """Recover the vector dimension of a namespace from its schema.

        The namespace's schema is fetched and the ``type`` of its ``vector``
        attribute is read as a string. That string is expected to carry the
        dimension in square brackets followed by a width suffix, e.g.
        ``"[768]f32"``, ``"[1536]f16"``, ``"[256]i8"``; the first bracketed
        run of digits is returned as the dimension.

        Returns:
            The dimension as an ``int``, or ``None`` when the namespace does
            not exist (``exists()`` is false, or either SDK call raises
            NotFoundError).

        Raises:
            VectorStoreError: If the namespace exists but its schema has no
                ``vector`` attribute, or that attribute's type string holds no
                bracketed dimension. These cases are deliberately not reported
                as "missing", so a malformed namespace cannot be mistaken for
                one that simply has not been created yet.
        """
        ns = self._get_namespace(namespace)

        # Both lookups treat NotFoundError the same way, so they share one handler.
        try:
            if not await ns.exists():
                return None
            schema = await ns.schema()
        except NotFoundError:
            return None

        vector_attr = schema.get("vector")
        if vector_attr is None:
            raise VectorStoreError(
                f"Turbopuffer namespace {namespace!r} exists but its schema"
                + " has no 'vector' attribute; cannot probe dim."
            )

        type_str = str(vector_attr.type)
        if (dim_match := re.search(r"\[(\d+)\]", type_str)) is None:
            raise VectorStoreError(
                f"Turbopuffer namespace {namespace!r} has an unparseable"
                + f" vector type {type_str!r}; expected `[<dim>]<width>`"
                + " (e.g. `[768]f32`). SDK format may have changed."
            )

        return int(dim_match.group(1))
