import hashlib
import logging
from dataclasses import dataclass, field
from pathlib import Path

from rag import chunker, embedder, loader, store
from rag.config import DEFAULT_COLLECTION

logger = logging.getLogger(__name__)

@dataclass
class IndexResult:
    """Summary of one indexing run against a single collection.

    The counters start at zero and ``errors`` starts as a fresh empty list
    for every instance; the indexing routine updates them as it goes.
    """

    # Name of the collection the run wrote to.
    collection: str
    # Number of documents that were chunked, embedded and upserted.
    docs_processed: int = 0
    # Running total of the "added" counts reported by the store.
    chunks_added: int = 0
    # Running total of the "skipped" counts reported by the store.
    chunks_skipped: int = 0
    # One "<source>: <exception>" message per document that failed.
    errors: list[str] = field(default_factory=list)


def index_documents(
    source: str | Path,
    collection: str = DEFAULT_COLLECTION,
    force: bool = False,
) -> IndexResult:
    """Load, chunk, embed and store the documents found at *source*.

    Documents are processed one at a time. A failure while handling one
    document is appended to ``IndexResult.errors`` and logged (with its
    traceback), and indexing continues with the next document.

    Args:
        source: Location passed to ``loader.load``.
        collection: Name of the target collection in the store.
        force: When true, ``store.reset(collection)`` is called before any
            chunk is written. The reset happens only after the source has
            been loaded successfully.

    Returns:
        An ``IndexResult`` holding the counters and error messages of the run.

    Raises:
        Exception: Anything raised by ``loader.load`` or ``store.reset``
            propagates to the caller; only per-document errors are caught.
    """
    result = IndexResult(collection=collection)

    # Load before resetting: if the source cannot be loaded, the existing
    # collection must not have been wiped already.
    docs = loader.load(source)

    if force:
        store.reset(collection)

    if not docs:
        logger.warning("No se encontraron documentos en: %s", source)
        return result

    for doc in docs:
        try:
            chunks = chunker.split(doc.text)
            if not chunks:
                logger.warning("Sin chunks utilizables en: %s", doc.source)
                continue

            embeddings = embedder.embed(chunks)

            ids = [_chunk_id(doc.source, i, text) for i, text in enumerate(chunks)]
            # doc.metadata is unpacked last, so its keys win if they clash
            # with "source" or "chunk_index".
            metadatas = [
                {"source": doc.source, "chunk_index": i, **doc.metadata}
                for i in range(len(chunks))
            ]

            added, skipped = store.upsert(collection, ids, embeddings, chunks, metadatas)
            result.docs_processed += 1
            result.chunks_added += added
            result.chunks_skipped += skipped
            logger.info(
                "Indexado '%s': %s chunks nuevos, %s omitidos.",
                doc.source,
                added,
                skipped,
            )

        except Exception as exc:
            # Best-effort boundary: one broken document must not abort the
            # whole run. Keep the traceback so the failure can be diagnosed.
            msg = f"{doc.source}: {exc}"
            result.errors.append(msg)
            logger.warning("Error indexando %s", msg, exc_info=True)

    return result


def _chunk_id(source: str, chunk_index: int, text: str) -> str:
    """Derive a short, deterministic identifier for one chunk.

    The identifier is the first 16 hexadecimal characters of the SHA-256
    digest of ``source``, ``chunk_index`` and the first 50 characters of
    ``text``, joined with ``::``. Text beyond the 50th character does not
    influence the result.
    """
    preview = text[:50]
    fingerprint = "{}::{}::{}".format(source, chunk_index, preview)
    digest = hashlib.sha256(fingerprint.encode())
    return digest.hexdigest()[:16]
