"""
fgs-agent — First Gain Stage metal discovery pipeline
Standalone FastAPI service; n8n just POSTs to /api/fgs/run on a schedule.

Endpoints:
  POST /api/fgs/run      — trigger a discovery run (async, returns run_id)
  GET  /api/fgs/status   — current run status
  GET  /api/fgs/picks    — read metal-picks.json
  GET  /api/fgs/health   — liveness check
"""

import asyncio
import json
import logging
import math
import re
import time
import uuid
from datetime import datetime, timezone
from typing import Any, Optional

import aiosqlite
import httpx
import uvicorn
from fastapi import BackgroundTasks, FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from pydantic_settings import BaseSettings, SettingsConfigDict

# Configure root logging at INFO with a "timestamp level message" line format.
# Every message emitted by this module goes through the "fgs-agent" logger.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger("fgs-agent")


# ── Config ──────────────────────────────────────────────────────────────────

class Settings(BaseSettings):
    """Runtime configuration for the service.

    Each field below carries a default; ``model_config`` additionally points
    pydantic-settings at a UTF-8 encoded ``.env`` file.
    """

    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")

    # Last.fm account queried by fetch_lastfm (sent as the "user" parameter).
    lastfm_username: str = "nick2day"
    # Last.fm API key (sent as "api_key"); empty by default.
    lastfm_api_key: str = ""
    # Base URL of the SearXNG instance; "/search" is appended to it.
    searxng_url: str = "http://localhost:8080"
    # Base URL of the Ollama server; "/api/embeddings" and "/api/chat" are appended.
    ollama_url: str = "http://localhost:11434"
    # JSON store rewritten by write_picks and served by GET /api/fgs/picks.
    picks_path: str = "/home/nick2day/.n8n/metal-picks.json"
    # SQLite file holding the `seen` table used for deduplication.
    dedup_db: str = "/home/nick2day/fgs-agent/dedup.db"
    # Port handed to uvicorn in the __main__ entry point.
    port: int = 8766


# Module-level configuration instance, created once at import time.
settings = Settings()


# ── SEARXNG QUERY BANK ───────────────────────────────────────────────────────
# All 29 queries — dynamic year, rotating subgenre index

# Pool of subgenres that build_queries() rotates through; each run uses three
# consecutive entries (wrapping around the end of the list).
SUBGENRES = [
    "death metal", "black metal", "doom metal", "thrash metal", "sludge metal",
    "progressive metal", "blackened death metal", "post-metal", "funeral doom", "grindcore",
    "melodic death metal", "technical death metal", "atmospheric black metal", "war metal",
    "drone metal", "stoner metal", "gothic metal", "power metal", "viking metal", "brutal death metal",
]

def build_queries() -> list[str]:
    """Assemble the SearXNG query bank for the current run.

    The current calendar year is interpolated into every query. Three
    consecutive entries of ``SUBGENRES`` are picked by a slot index that
    advances once every 21600 seconds (6 hours) of wall-clock time and wraps
    around the list, so successive runs target different niches.

    Returns:
        The query strings, grouped by theme in a fixed order.
    """
    year = datetime.now().year
    pool_size = len(SUBGENRES)
    slot = int(time.time() // 21600) % pool_size
    primary, secondary, tertiary = (
        SUBGENRES[(slot + offset) % pool_size] for offset in range(3)
    )

    subgenre_releases = [
        f'{primary} new album release {year}',
        f'{secondary} new album {year}',
        f'{tertiary} full album stream {year}',
    ]

    # Bandcamp label searches; only the first two carry the extra "new" keyword.
    label_searches = [
        f'site:bandcamp.com "{label}" new {year}'
        for label in ("sentient ruin", "20 buck spin")
    ] + [
        f'site:bandcamp.com "{label}" {year}'
        for label in (
            "prosthetic records",
            "unique leader records",
            "redefining darkness",
            "profound lore",
            "season of mist",
            "century media",
            "nuclear blast",
        )
    ]

    bandcamp_genre_pages = [
        f'site:bandcamp.com {genre} album {year}' for genre in (primary, secondary)
    ]

    press_coverage = [
        f'site:metal-archives.com new album review {year}',
        f'site:metalinjection.net new album {year}',
        f'site:cvltnation.com new album {year}',
        f'site:decibelmag.com new album stream {year}',
        f'site:invisibleoranges.com album review {year}',
        f'site:brooklynvegan.com {primary} new album {year}',
    ]

    underground = [
        f'underground {primary} demo release {year}',
        f'independent {primary} album bandcamp {year}',
        f'new {primary} EP release {year}',
    ]

    # Michigan / regional bias
    regional = [
        f'Michigan metal band new album {year}',
        f'Detroit metal band new release {year}',
    ]

    roundups = [
        f'best {primary} albums {year}',
        f'new {primary} releases {year} bandcamp',
    ]

    lastfm_tag_pages = [f'site:last.fm {primary} new {year}']
    youtube_full_albums = [f'"{primary}" "full album" {year} new band']

    return [
        *subgenre_releases,
        *label_searches,
        *bandcamp_genre_pages,
        *press_coverage,
        *underground,
        *regional,
        *roundups,
        *lastfm_tag_pages,
        *youtube_full_albums,
    ]


# ── NOISE FILTERS ────────────────────────────────────────────────────────────

# Case-insensitive patterns marking a search hit as noise (encyclopedia pages,
# listicles, release calendars, announcements, soundtracks, tributes, ...).
# is_noise() searches each one against "<title> <url>".
NOISE_PATTERNS = [
    re.compile(expression, re.I)
    for expression in (
        r'wikipedia\.org',
        r'\bin heavy metal music\b',
        r'album release calendar',
        r'most anticipated',
        r'\byear in review\b',
        r'^top \d+',
        r'albums you need to hear',
        r'upcoming.*releases',
        r'\bto release\b.*album',
        r'\bannounce[sd]?\b.*album',
        r'\bdelayed\b',
        r'best.*releases',
        r'\d+ metal albums.*20\d\d',
        r'shatner',
        r'mark your calendars',
        r'\bvideo game\b',
        r'\bsoundtrack\b',
        r'\btribute\b',
        r'\bcovers album\b',
        r'heavy metal music - wikipedia',
        r'in \d{4} - wikipedia',
    )
]


def is_noise(title: str, url: str) -> bool:
    """Return True when any NOISE_PATTERNS entry matches "<title> <url>"."""
    haystack = f"{title} {url}"
    for pattern in NOISE_PATTERNS:
        if pattern.search(haystack):
            return True
    return False


# "Album Title, by Artist Name" — Bandcamp page-title format.
_BANDCAMP_TITLE = re.compile(r'^(.+?),\s+by\s+(.+)$', re.I)
# "Artist - Album": a dash with whitespace on both sides is a real separator.
_SPACED_DASH_TITLE = re.compile(r'^(.+?)\s+[-–—]\s+(.+)$')
# "Artist-Album": bare dash, used only when no spaced separator exists.
_BARE_DASH_TITLE = re.compile(r'^([^-–—]+?)\s*[-–—]\s*(.+)$')
# Anything longer than this before the dash is treated as a headline, not a band.
_MAX_ARTIST_LEN = 60


def parse_title(title: str) -> tuple[str, str]:
    """Extract (artist, album) from common title formats.

    Formats are tried in order:

    1. ``"Album, by Artist"`` (Bandcamp).
    2. ``"Artist - Album"`` with whitespace around a hyphen, en dash or em
       dash. Trying this first keeps hyphenated names intact
       (``"Anti-Flag - X"`` yields artist ``"Anti-Flag"``, not ``"Anti"``).
    3. ``"Artist-Album"`` split at the first dash of any kind.

    Args:
        title: Raw title of a search result.

    Returns:
        ``(artist, album)``; ``artist`` is ``''`` and ``album`` is the
        stripped title when no format matches.
    """
    m = _BANDCAMP_TITLE.match(title)
    if m:
        return m.group(2).strip(), m.group(1).strip()
    for pattern in (_SPACED_DASH_TITLE, _BARE_DASH_TITLE):
        m = pattern.match(title)
        if m and len(m.group(1)) < _MAX_ARTIST_LEN:
            return m.group(1).strip(), m.group(2).strip()
    return '', title.strip()


def artist_from_url(url: str) -> str:
    """Derive an artist name from a ``<slug>.bandcamp.com`` URL.

    Returns the subdomain slug with hyphens turned into spaces, or ``''``
    when the URL does not start with an http(s) Bandcamp subdomain.
    """
    hit = re.match(r'https?://([^.]+)\.bandcamp\.com', url, re.I)
    return hit.group(1).replace('-', ' ') if hit else ''


# ── DEDUP ────────────────────────────────────────────────────────────────────

async def init_dedup_db():
    """Create the ``seen`` table in the dedup database if it does not exist."""
    schema = "CREATE TABLE IF NOT EXISTS seen (key TEXT PRIMARY KEY, ts INTEGER)"
    async with aiosqlite.connect(settings.dedup_db) as conn:
        await conn.execute(schema)
        await conn.commit()


def _dedup_key(item: dict) -> str:
    """Return the dedup key for a candidate/pick, or ``''`` if none can be built.

    The key is ``"<artist>::<album>"`` (lower-cased) when at least one of the
    two is present, otherwise the item's URL. Both the lookup and the insert
    path use this function so they always agree on the key.
    """
    artist = (item.get("artist") or "").lower()
    album = (item.get("album") or "").lower()
    if artist or album:
        return f"{artist}::{album}"
    return item.get("url") or ""


async def filter_unseen(candidates: list[dict]) -> list[dict]:
    """Remove already-seen albums. Returns only fresh candidates.

    Args:
        candidates: Items carrying ``artist``/``album`` and/or ``url``.

    Returns:
        The candidates whose dedup key is not in the ``seen`` table, in their
        original order. Items with no usable key are always kept.
    """
    async with aiosqlite.connect(settings.dedup_db) as db:
        fresh = []
        for c in candidates:
            key = _dedup_key(c)
            if key:
                # Context manager closes the cursor after each lookup.
                async with db.execute("SELECT 1 FROM seen WHERE key=?", (key,)) as cursor:
                    if await cursor.fetchone():
                        continue
            fresh.append(c)
        return fresh


async def mark_seen(picks: list[dict]):
    """Record items in the ``seen`` table so later runs skip them.

    Uses the same key derivation as ``filter_unseen`` (including the URL
    fallback for items without artist/album), so anything marked here is
    actually filtered on the next run. Existing keys are left untouched.
    """
    async with aiosqlite.connect(settings.dedup_db) as db:
        ts = int(time.time())
        for p in picks:
            key = _dedup_key(p)
            if key:
                await db.execute(
                    "INSERT OR IGNORE INTO seen (key, ts) VALUES (?, ?)", (key, ts)
                )
        await db.commit()


# ── PIPELINE STAGES ──────────────────────────────────────────────────────────

async def fetch_lastfm(client: httpx.AsyncClient) -> dict:
    """Fetch the user's 12-month top artists and recent tracks from Last.fm.

    Both requests are issued concurrently with a 15 second timeout each.

    Returns:
        ``{"top": ..., "recent": ...}`` holding the decoded JSON body of each
        response, or ``{}`` for a response whose status is not 200.
    """
    endpoint = "https://ws.audioscrobbler.com/2.0/"

    def query(method: str, limit: str, **extra: str) -> dict:
        # Key order mirrors the order the parameters are sent in.
        return {
            "method": method,
            "user": settings.lastfm_username,
            "api_key": settings.lastfm_api_key,
            **extra,
            "limit": limit,
            "format": "json",
        }

    def body_or_empty(response) -> dict:
        if response.status_code != 200:
            return {}
        return response.json()

    top_response, recent_response = await asyncio.gather(
        client.get(endpoint, params=query("user.gettopartists", "100", period="12month"), timeout=15),
        client.get(endpoint, params=query("user.getrecenttracks", "1000"), timeout=15),
    )
    return {
        "top": body_or_empty(top_response),
        "recent": body_or_empty(recent_response),
    }


def build_taste_profile(lastfm: dict) -> dict:
    """Build the taste profile from Last.fm data.

    Args:
        lastfm: ``{"top": <user.gettopartists body>, "recent":
            <user.getrecenttracks body>}`` as returned by ``fetch_lastfm``;
            either part may be empty.

    Returns:
        ``{"topArtists": [{"name", "playcount", "rank"}, ...],
        "recentArtists": [{"name", "plays"}, ...]}`` — at most 100 top
        artists, and the 20 most frequent artists among the recent tracks
        (ties keep first-seen order).
    """

    def as_records(value) -> list:
        # A lone object is accepted in place of a one-element list; anything
        # that is not a dict entry is dropped.
        if isinstance(value, dict):
            return [value]
        if isinstance(value, list):
            return [entry for entry in value if isinstance(entry, dict)]
        return []

    top_artists_raw = as_records(
        ((lastfm.get("top") or {}).get("topartists") or {}).get("artist")
    )
    recent_tracks_raw = as_records(
        ((lastfm.get("recent") or {}).get("recenttracks") or {}).get("track")
    )

    top_artists = [
        {
            "name": a["name"],
            "playcount": int(a.get("playcount", 0)),
            "rank": int((a.get("@attr") or {}).get("rank", 99)),
        }
        for a in top_artists_raw[:100]
        if a.get("name")  # skip malformed entries instead of raising KeyError
    ]

    recent_map: dict[str, int] = {}
    for t in recent_tracks_raw:
        artist = t.get("artist")
        name = artist.get("#text", "") if isinstance(artist, dict) else ""
        if name:
            recent_map[name] = recent_map.get(name, 0) + 1
    recent_artists = sorted(recent_map.items(), key=lambda x: -x[1])[:20]

    return {
        "topArtists": top_artists,
        "recentArtists": [{"name": n, "plays": p} for n, p in recent_artists],
    }


async def search_candidates(client: httpx.AsyncClient) -> list[dict]:
    """Run every SearXNG query concurrently and merge the hits.

    For each query the first 15 results are taken, noise is dropped, and an
    artist/album pair is parsed from the title (falling back to the Bandcamp
    subdomain for the artist). A query that fails or returns a non-200 status
    contributes nothing.

    Returns:
        Candidate dicts (``title``, ``url``, ``artist``, ``album``,
        ``source``, ``content``), one per distinct non-empty URL, keeping the
        first occurrence.
    """
    queries = build_queries()
    log.info(f"Running {len(queries)} SearXNG queries")

    async def run_query(q: str) -> list[dict]:
        try:
            response = await client.get(
                f"{settings.searxng_url}/search",
                params={"q": q, "format": "json", "time_range": "month"},
                timeout=20,
            )
            if response.status_code != 200:
                return []
            kept = []
            # Only the top 15 hits of each query are considered.
            for hit in response.json().get("results", [])[:15]:
                title = hit.get("title", "")
                url = hit.get("url", "")
                if is_noise(title, url):
                    continue
                artist, album = parse_title(title)
                kept.append({
                    "title": title,
                    "url": url,
                    "artist": artist or artist_from_url(url),
                    "album": album,
                    "source": q[:60],
                    "content": (hit.get("content") or "")[:200],
                })
            return kept
        except Exception as e:
            log.warning(f"SearXNG query failed: {q[:40]} — {e}")
            return []

    batches = await asyncio.gather(*(run_query(q) for q in queries))

    # First hit wins for each URL; dict insertion order keeps the original order.
    by_url: dict[str, dict] = {}
    for batch in batches:
        for candidate in batch:
            link = candidate["url"]
            if link:
                by_url.setdefault(link, candidate)
    unique = list(by_url.values())

    log.info(f"Collected {len(unique)} unique candidates from search")
    return unique


async def embed_text(client: httpx.AsyncClient, text: str) -> list[float]:
    """Embed ``text`` with Ollama's ``nomic-embed-text`` model.

    Returns:
        The ``embedding`` list from the response, or ``[]`` when the request
        raises, the status is not 200, or the field is absent.
    """
    endpoint = f"{settings.ollama_url}/api/embeddings"
    payload = {"model": "nomic-embed-text", "prompt": text}
    try:
        response = await client.post(endpoint, json=payload, timeout=30)
        if response.status_code != 200:
            return []
        return response.json().get("embedding", [])
    except Exception as e:
        log.warning(f"Embed failed: {e}")
        return []


def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Yields ``0.0`` when either vector is empty, the lengths differ, or either
    vector has zero magnitude.
    """
    comparable = bool(a) and bool(b) and len(a) == len(b)
    if not comparable:
        return 0.0

    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    if norm_a == 0 or norm_b == 0:
        return 0.0

    dot_product = sum(x * y for x, y in zip(a, b))
    return dot_product / (norm_a * norm_b)


async def score_candidates(
    client: httpx.AsyncClient, candidates: list[dict], taste: dict
) -> list[dict]:
    """Score candidates by embedding similarity to the taste centroid.

    The centroid is the embedding of a single sentence listing the first 30
    top-artist names. Each candidate dict is mutated in place to carry an
    ``embedScore`` (cosine similarity rounded to 4 places, or ``0.0`` when
    its embedding is unavailable).

    Returns:
        ``[]`` for empty input; the input list itself (all scores ``0.0``)
        when the centroid cannot be embedded; otherwise a new list sorted by
        descending ``embedScore``.
    """
    if not candidates:
        return []

    favourite_names = [entry["name"] for entry in taste.get("topArtists", [])[:30]]
    centroid_text = "Heavy metal artist: " + ", ".join(favourite_names)

    log.info("Embedding taste centroid")
    centroid_vec = await embed_text(client, centroid_text)

    if not centroid_vec:
        log.warning("Centroid embedding failed — scoring by 0")
        for item in candidates:
            item["embedScore"] = 0.0
        return candidates

    log.info(f"Scoring {len(candidates)} candidates against taste centroid")
    ranked = []
    # Candidates are embedded one at a time (sequentially), so Ollama only
    # ever sees a single in-flight request from this loop.
    for position, item in enumerate(candidates):
        if item.get("artist"):
            label = f"Metal album: {item['artist']} - {item['album']}"
        else:
            label = f"Metal album: {item['album']}"
        vec = await embed_text(client, label)
        if vec:
            item["embedScore"] = round(cosine_similarity(centroid_vec, vec), 4)
        else:
            item["embedScore"] = 0.0
        if position % 20 == 0:
            log.info(f"  scored {position}/{len(candidates)}")
        ranked.append(item)

    ranked.sort(key=lambda item: -item.get("embedScore", 0))
    return ranked


async def prefilter_candidates(
    client: httpx.AsyncClient, candidates: list[dict]
) -> list[dict]:
    """Use mistral-nemo to prefilter — keep actual album releases, drop noise.

    Only the first 80 candidates are sent, in a single prompt. The model is
    asked for a JSON array of kept items referencing candidates by index.

    Args:
        client: HTTP client used to reach Ollama.
        candidates: Candidates in priority order.

    Returns:
        Copies of the kept candidates with the model's ``artist``/``album``
        (when non-empty) and a ``prefilter`` dict (``subgenre``,
        ``confidence``). If the request raises or no JSON array can be
        parsed, the first 40 input candidates are returned unchanged.
    """
    if not candidates:
        return []

    max_prompt_items = 80   # candidates shown to the model
    fallback_items = 40     # passed through untouched when the model fails

    batch = candidates[:max_prompt_items]
    items_for_prompt = [
        {"index": i, "artist": c.get("artist", ""), "album": c.get("album", ""), "url": c.get("url", "")}
        for i, c in enumerate(batch)
    ]

    prompt = f"""You are a metal music expert. Review these {len(items_for_prompt)} items from web searches.
Return ONLY a JSON array of items that are actual metal album or EP releases (not news articles, not Wikipedia, not lists).
For each item kept, include: index (integer), artist (string), album (string), subgenre (string), confidence (0.0-1.0).

Items:
{json.dumps(items_for_prompt, indent=2)}

Return ONLY valid JSON array starting with ["""

    body = {
        "model": "mistral-nemo:latest",
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 4000},
    }

    try:
        r = await client.post(
            f"{settings.ollama_url}/api/chat",
            json=body,
            timeout=120,
        )
        raw = r.json().get("message", {}).get("content", "") if r.status_code == 200 else ""
    except Exception as e:
        log.error(f"Prefilter failed: {e}")
        return candidates[:fallback_items]  # fall through with top candidates

    # Extract JSON from response
    parsed = extract_json_array(raw)
    if not parsed:
        log.warning("Prefilter returned no parseable JSON — using all candidates")
        return candidates[:fallback_items]

    # Map back to full candidate objects with prefilter data. The model's
    # output is untrusted: skip anything that is not an object carrying a
    # real, in-range, not-yet-used integer index.
    enriched = []
    used_indices: set[int] = set()
    for p in parsed:
        if not isinstance(p, dict):
            continue
        idx = p.get("index")
        # bool is an int subclass; True/False must not be read as 1/0.
        if not isinstance(idx, int) or isinstance(idx, bool):
            continue
        # Negative values would silently wrap around to the end of the batch.
        if idx < 0 or idx >= len(batch) or idx in used_indices:
            continue
        used_indices.add(idx)
        orig = batch[idx].copy()
        orig["artist"] = p.get("artist") or orig.get("artist", "")
        orig["album"] = p.get("album") or orig.get("album", "")
        orig["prefilter"] = {
            "subgenre": p.get("subgenre", "unknown"),
            "confidence": p.get("confidence", 0.5),
        }
        enriched.append(orig)

    log.info(f"Prefilter kept {len(enriched)}/{len(batch)} candidates")
    return enriched


def _validate_picks(picks: list, top: list[dict]) -> list[dict]:
    """Keep curator picks that score >= 60 and trace back to a shown candidate.

    A pick is accepted when its ``url`` equals a candidate URL, or when its
    ``index`` is a valid position in ``top``; in the latter case the pick's
    URL is replaced by that candidate's URL so a missing or invented link is
    never stored. Non-object items and non-numeric scores are skipped.
    """
    known_urls = {c.get("url", "") for c in top if c.get("url")}
    validated = []
    for p in picks:
        if not isinstance(p, dict):
            continue
        try:
            score = float(p.get("score", 0))
        except (TypeError, ValueError):
            continue
        if score < 60:
            continue
        idx = p.get("index")
        url = p.get("url", "")
        index_ok = (
            isinstance(idx, int)
            and not isinstance(idx, bool)
            and 0 <= idx < len(top)
        )
        if isinstance(url, str) and url in known_urls:
            validated.append(p)
        elif index_ok:
            p["url"] = top[idx].get("url", "")
            validated.append(p)
        else:
            log.debug(f"Dropped hallucinated pick: {p.get('artist')} — {p.get('album')}")
    return validated


async def curate_picks(
    client: httpx.AsyncClient, candidates: list[dict], taste: dict
) -> list[dict]:
    """Use mistral-nemo to produce final scored picks with full metadata.

    The 20 candidates with the highest ``embedScore`` are shown to the model
    together with the taste profile. The request is attempted up to 3 times,
    2 seconds apart, until at least one validated pick comes back.

    Args:
        client: HTTP client used to reach Ollama.
        candidates: Prefiltered candidates.
        taste: Profile from ``build_taste_profile``.

    Returns:
        Validated picks (score >= 60, tied to a shown candidate), or ``[]``
        when there are no candidates or every attempt fails.
    """
    if not candidates:
        return []

    max_attempts = 3
    top = sorted(candidates, key=lambda x: -x.get("embedScore", 0))[:20]
    top_artists = [a["name"] for a in taste.get("topArtists", [])[:20]]
    recent_artists = [a["name"] for a in taste.get("recentArtists", [])[:10]]
    year = datetime.now().year

    items_for_prompt = [
        {
            "index": i,
            "artist": c.get("artist", ""),
            "album": c.get("album", ""),
            "subgenre": (c.get("prefilter") or {}).get("subgenre", "unknown"),
            "url": c.get("url", ""),
            "source": c.get("source", ""),
        }
        for i, c in enumerate(top)
    ]

    # The listener's name comes from settings so the prompt follows the
    # configured Last.fm account.
    prompt = f"""You are a heavy metal curator reviewing NEW {year} releases found via web search.

IMPORTANT: Only score albums from the list below. Do NOT recommend albums not in this list.
Do NOT recommend classic albums. These must be actual {year} releases found at the provided URLs.
If you are not sure an item is a real new release, skip it.

{settings.lastfm_username}'s taste profile:
- Top artists (12-month): {', '.join(top_artists)}
- Recent listens: {', '.join(recent_artists)}

New {year} releases to evaluate (use ONLY these):
{json.dumps(items_for_prompt, indent=2)}

For each item worth recommending, output a JSON object with:
  index (integer, from above), artist, album, score (0-100, based on fit to taste profile),
  subgenre, obscurity (underground/indie/major), similar_to (2-3 artists from taste profile),
  why (1 sentence), review_angle (unique angle), tags (string array), url (copy from above)

Skip items without a clear artist + album. Skip classics or non-{year} releases.
Return ONLY a JSON array starting with ["""

    body = {
        "model": "mistral-nemo:latest",
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
        "options": {"temperature": 0.3, "num_predict": 8000},
    }

    for attempt in range(max_attempts):
        try:
            r = await client.post(
                f"{settings.ollama_url}/api/chat",
                json=body,
                timeout=180,
            )
            raw = r.json().get("message", {}).get("content", "") if r.status_code == 200 else ""
            picks = extract_json_array(raw)
            if picks:
                validated = _validate_picks(picks, top)
                log.info(f"Curator returned {len(validated)} validated picks (attempt {attempt+1})")
                if validated:
                    return validated
        except Exception as e:
            log.error(f"Curator attempt {attempt+1} failed: {e}")
        # Pause between attempts only; there is nothing to wait for after the last.
        if attempt < max_attempts - 1:
            await asyncio.sleep(2)

    log.warning("Curator failed all attempts")
    return []


def extract_json_array(text: str) -> list:
    """Robustly extract a JSON array from LLM output.

    Strategy, after stripping markdown code fences:

    1. Parse the whole text. A list is returned as is; for an object, its
       first list-valued member is returned.
    2. Otherwise decode a single JSON value starting at the first ``[`` and
       ignore whatever follows it, so trailing commentary (even commentary
       containing ``]``) does not break extraction.

    Args:
        text: Raw model output; may be empty.

    Returns:
        The extracted list, or ``[]`` when nothing parseable is found.
    """
    if not text:
        return []
    # Strip markdown fences
    cleaned = re.sub(r'```(?:json)?\s*', '', text).strip()

    try:
        data = json.loads(cleaned)
    except (ValueError, RecursionError):
        data = None
    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        for v in data.values():
            if isinstance(v, list):
                return v

    start = cleaned.find('[')
    if start == -1:
        return []
    try:
        # raw_decode stops at the end of the first complete JSON value.
        data, _end = json.JSONDecoder().raw_decode(cleaned[start:])
    except (ValueError, RecursionError):
        return []
    return data if isinstance(data, list) else []


async def write_picks(new_picks: list[dict], run_stats: dict):
    """Prepend new picks and a run record to the picks store on disk.

    The store keeps the 200 most recent picks and the 10 most recent run
    records. It is written to a sibling temp file and then moved over the
    real path, so a crash mid-write cannot leave a truncated store behind.

    Args:
        new_picks: Picks produced by this run (newest first in the store).
        run_stats: Optional counters; ``candidates`` and ``filtered`` are
            copied into the run record (default 0).
    """
    import os  # local import: `os` is not among this module's top-level imports

    try:
        with open(settings.picks_path, "r", encoding="utf-8") as f:
            store = json.load(f)
    except FileNotFoundError:
        store = None  # first run: nothing stored yet
    except (OSError, ValueError) as e:
        # Stay best-effort, but make the loss of the old store visible.
        log.warning(f"Could not read {settings.picks_path} ({e}) — starting a fresh store")
        store = None
    if not isinstance(store, dict):
        store = {"allPicks": [], "runs": [], "lastUpdated": ""}

    # Add run record
    run = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "source": "fgs-agent",
        "picksAdded": len(new_picks),
        "candidatesSearched": run_stats.get("candidates", 0),
        "candidatesFiltered": run_stats.get("filtered", 0),
    }
    store["runs"] = ([run] + store.get("runs", []))[:10]
    store["allPicks"] = (new_picks + store.get("allPicks", []))[:200]
    store["lastUpdated"] = run["timestamp"]

    # Write-then-rename: readers see either the old file or the complete new one.
    tmp_path = f"{settings.picks_path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(store, f, indent=2)
    os.replace(tmp_path, settings.picks_path)

    log.info(f"Wrote {len(new_picks)} new picks to {settings.picks_path}")


# ── RUN STATE ────────────────────────────────────────────────────────────────

# In-memory record of the current/most recent pipeline run. It is reset by
# pipeline_run at the start of each run, mutated as the run progresses, and
# returned as-is by GET /api/fgs/status.
run_state: dict[str, Any] = {
    "status": "idle",        # pipeline_run sets "running", "done" or "error"
    "run_id": None,
    "started": None,         # ISO-8601 UTC timestamps once a run has started/ended
    "finished": None,
    "picks_added": 0,
    "candidates_found": 0,
    "error": None,           # str(exception) when a run fails
    "log": [],               # [{"ts": ..., "msg": ...}] entries appended per step
}
# Lock acquired by pipeline_run while it resets run_state.
_run_lock = asyncio.Lock()


async def pipeline_run(run_id: str):
    """Execute one full discovery run and record its progress in ``run_state``.

    Stages: Last.fm fetch -> taste profile -> SearXNG search -> dedup filter
    -> embedding score -> LLM prefilter -> LLM curation -> persist picks.

    ``_run_lock`` is held for the whole run, and a call that finds it already
    held returns immediately, so two runs can never overlap. Any exception is
    logged and stored in ``run_state["error"]`` rather than raised.

    Args:
        run_id: Short identifier recorded in ``run_state`` for this run.
    """

    def log_step(msg: str):
        log.info(msg)
        run_state["log"].append({"ts": datetime.now(timezone.utc).isoformat(), "msg": msg})

    def finish(status: str, **extra: Any):
        run_state.update({
            "status": status,
            "finished": datetime.now(timezone.utc).isoformat(),
            **extra,
        })

    # No await separates this check from the acquisition below (acquiring a
    # free asyncio.Lock does not suspend), so the check cannot be raced.
    if _run_lock.locked():
        log.warning(f"Run {run_id} skipped — another run is already in progress")
        return

    async with _run_lock:
        run_state.update({
            "status": "running",
            "run_id": run_id,
            "started": datetime.now(timezone.utc).isoformat(),
            "finished": None,
            "picks_added": 0,
            "candidates_found": 0,
            "error": None,
            "log": [],
        })

        try:
            async with httpx.AsyncClient() as client:
                log_step("Fetching Last.fm data")
                lastfm = await fetch_lastfm(client)
                taste = build_taste_profile(lastfm)
                log_step(f"Taste profile: {len(taste['topArtists'])} top artists, {len(taste['recentArtists'])} recent")

                log_step("Searching candidates")
                candidates = await search_candidates(client)
                run_state["candidates_found"] = len(candidates)

                if not candidates:
                    log_step("No candidates found — ending run")
                    finish("done")
                    return

                log_step("Filtering seen albums")
                fresh = await filter_unseen(candidates)
                log_step(f"{len(fresh)} fresh candidates (of {len(candidates)} total)")

                if not fresh:
                    log_step("All candidates already seen — ending run")
                    finish("done")
                    return

                log_step("Scoring by embedding similarity")
                scored = await score_candidates(client, fresh, taste)

                log_step("Prefiltering with Mistral")
                prefiltered = await prefilter_candidates(client, scored)
                log_step(f"{len(prefiltered)} candidates passed prefilter")

                log_step("Curating picks with Mistral")
                picks = await curate_picks(client, prefiltered, taste)
                log_step(f"{len(picks)} picks scored ≥60")

                if picks:
                    # Mark all prefiltered candidates as seen (not just picks) to avoid re-processing
                    await mark_seen(prefiltered)
                    await write_picks(picks, {
                        "candidates": len(candidates),
                        "filtered": len(prefiltered),
                    })

                finish("done", picks_added=len(picks))
                log_step(f"Run complete — {len(picks)} new picks added")

        except Exception as e:
            log.exception(f"Pipeline run failed: {e}")
            finish("error", error=str(e))


# ── API ───────────────────────────────────────────────────────────────────────

# ASGI application object; the route and startup decorators below attach to it.
app = FastAPI(title="fgs-agent", description="First Gain Stage metal discovery")


@app.on_event("startup")
async def startup():
    """Startup hook: make sure the dedup table exists, then log the configured port."""
    await init_dedup_db()
    log.info(f"fgs-agent started on port {settings.port}")


@app.post("/api/fgs/run")
async def trigger_run(background_tasks: BackgroundTasks):
    """Start a discovery run in the background.

    Returns:
        ``{"status": "started", "run_id": ...}``, or a 409 response carrying
        the active ``run_id`` when a run is already in progress.
    """
    if run_state["status"] == "running":
        return JSONResponse({"status": "already_running", "run_id": run_state["run_id"]}, status_code=409)
    run_id = str(uuid.uuid4())[:8]
    # Claim the slot right away. The background task only begins after this
    # response has been sent, so without this a second POST arriving in that
    # window would pass the check above and start a parallel run.
    run_state.update({
        "status": "running",
        "run_id": run_id,
        "started": datetime.now(timezone.utc).isoformat(),
        "finished": None,
        "error": None,
    })
    background_tasks.add_task(pipeline_run, run_id)
    return {"status": "started", "run_id": run_id}


@app.get("/api/fgs/status")
async def get_status():
    """Return the module-level ``run_state`` dict (status, timestamps, counters, step log)."""
    return run_state


@app.get("/api/fgs/picks")
async def get_picks():
    """Serve the parsed contents of the picks JSON file.

    Any failure to open or decode the file is reported as a 500 response
    whose body is ``{"error": <exception text>}``.
    """
    try:
        with open(settings.picks_path, "r") as picks_file:
            payload = json.loads(picks_file.read())
    except Exception as exc:
        return JSONResponse({"error": str(exc)}, status_code=500)
    return payload


@app.get("/api/fgs/health")
async def health():
    """Liveness probe: always answers with a fixed OK payload."""
    return dict(status="ok", agent="fgs-agent")


# ── Entry point ───────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # Hand uvicorn the app object rather than an "agent:app" import string.
    # The string form only resolves when this file is named agent.py, and it
    # imports the module a second time. With reload disabled and no worker
    # count set, the object form is sufficient.
    uvicorn.run(app, host="0.0.0.0", port=settings.port, reload=False)