import logging
from dataclasses import dataclass, field
from pathlib import Path

from rag.config import SUPPORTED_EXTENSIONS

logger = logging.getLogger(__name__)

try:
    import PyPDF2

    _PDF_AVAILABLE = True
except ImportError:
    _PDF_AVAILABLE = False


@dataclass
class Document:
    """A loaded source file: its text plus information identifying its origin.

    Within this module, instances are created by ``_make_doc``.
    """

    # Text content read (or extracted) from the file.
    text: str
    # Short label for the origin; ``_make_doc`` sets it to the file name.
    source: str
    # Location of the file; ``_make_doc`` stores the resolved path.
    path: Path
    # Extra information about the file. Each instance gets its own empty dict
    # by default; ``_make_doc`` fills in "source", "extension" and "size_bytes".
    metadata: dict = field(default_factory=dict)


def load(source: str | Path) -> list[Document]:
    """Load documents from a single file or from every file under a directory.

    Args:
        source: Path to a file or a directory. Directories are walked
            recursively, visiting entries in sorted order.

    Returns:
        The documents produced for every file whose lowercased suffix is in
        ``SUPPORTED_EXTENSIONS``. An empty list is returned (after logging a
        warning) when the path does not exist, is neither a file nor a
        directory, or is a single file with an unsupported suffix.
    """
    root = Path(source)

    if not root.exists():
        logger.warning("Fuente no encontrada: %s", root)
        return []

    if root.is_dir():
        # Lazy filter so each entry is checked right before it is loaded.
        eligible = (
            entry
            for entry in sorted(root.rglob("*"))
            if entry.is_file() and entry.suffix.lower() in SUPPORTED_EXTENSIONS
        )
        return [doc for entry in eligible for doc in _load_file(entry)]

    if not root.is_file():
        # Exists, but is something else (e.g. a special file).
        logger.warning(f"No es archivo ni directorio: {root}")
        return []

    if root.suffix.lower() in SUPPORTED_EXTENSIONS:
        return _load_file(root)

    logger.warning(f"Extensión no soportada: {root.suffix}")
    return []


def _load_file(path: Path) -> list[Document]:
    """Load one file, choosing the reader from its suffix.

    ``.pdf`` files (case-insensitive) go to ``_load_pdf``; everything else is
    read as text by ``_load_text``. Loading is best-effort: any exception
    raised by the reader is logged as an error and an empty list is returned,
    so one unreadable file does not abort a whole directory load.
    """
    reader = _load_pdf if path.suffix.lower() == ".pdf" else _load_text
    try:
        documents = reader(path)
    except Exception as exc:
        logger.error(f"Error leyendo {path}: {exc}")
        documents = []
    return documents


def _load_text(path: Path) -> list[Document]:
    """Read a text file and wrap its contents in a single Document.

    The file is decoded as UTF-8 first and, if that fails, as Latin-1.

    Args:
        path: File to read.

    Returns:
        A one-element list holding the document built by ``_make_doc``.

    Raises:
        OSError: If the file cannot be opened or read (``_load_file`` catches
            and logs this).
    """
    try:
        # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also drops
        # a leading byte order mark, which would otherwise end up in the
        # document text as a stray U+FEFF character.
        text = path.read_text(encoding="utf-8-sig")
    except UnicodeDecodeError:
        # Latin-1 maps every possible byte to a code point, so this fallback
        # always decodes; no further "could not decode" branch is needed.
        text = path.read_text(encoding="latin-1")
    return [_make_doc(text, path)]


def _load_pdf(path: Path) -> list[Document]:
    """Extract the text of a PDF into a single Document.

    Each page with non-blank extracted text contributes a section headed by
    a ``--- página N ---`` marker, where N is the 1-based page position.
    Pages with no text (or only whitespace) are skipped. The sections are
    joined with newlines.

    Returns:
        A one-element list with the resulting document, or an empty list
        (after logging a warning) when PyPDF2 could not be imported.
    """
    if _PDF_AVAILABLE:
        # Extraction happens while the file is still open.
        with open(path, "rb") as stream:
            pdf = PyPDF2.PdfReader(stream)
            sections = [
                f"\n--- página {number} ---\n{content}"
                for number, page in enumerate(pdf.pages, start=1)
                if (content := page.extract_text() or "").strip()
            ]
        return [_make_doc("\n".join(sections), path)]

    logger.warning(
        f"PyPDF2 no está instalado; omitiendo {path}. "
        "Instálalo con: pip install PyPDF2"
    )
    return []


def _make_doc(text: str, path: Path) -> Document:
    """Build a Document for ``text`` that was read from ``path``.

    The document's ``source`` is the file name, its ``path`` is the resolved
    path, and its metadata records the file name, the lowercased suffix and
    the file size in bytes as reported by ``path.stat()``.
    """
    file_name = path.name
    details = {
        "source": file_name,
        "extension": path.suffix.lower(),
        "size_bytes": path.stat().st_size,
    }
    return Document(
        text=text,
        source=file_name,
        path=path.resolve(),
        metadata=details,
    )