Fix pipeline yield: dedup, query expansion, parallel prefilter

- Dedup: mark only accepted picks as seen (not all prefiltered candidates) — unselected items stay eligible for re-evaluation, preventing pool exhaustion across runs - Queries: expanded from 29 to 37+ with rotating 30-subgenre list, 25 label targets, 14 review sites; Bandcamp/MA queries skip time_range for broader results; review sites use time_range:year - Results per query: 15 → 25 - Prefilter: parallel batches of 35 (up to 3 concurrent), processes all fresh candidates instead of just top 80; be-inclusive prompt - Curator: cap 20 → 30, score floor 60 → 50, URL prefix matching in provenance check instead of exact match Result: 405 candidates/run vs 146 before; 88 passing prefilter vs 10; pool stays at ~400 fresh on consecutive runs. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
3 months ago · 39d6051a1f
parent 05bb4193ad
commit 39d6051a1f
1 changed files with 161 additions and 87 deletions
--- a/agent.py
+++ b/agent.py
@ -49,62 +49,120 @@ settings = Settings()
 # ── SEARXNG QUERY BANK ───────────────────────────────────────────────────────
 # All 29 queries — dynamic year, rotating subgenre index
 SUBGENRES = [
    "death metal", "black metal", "doom metal", "thrash metal", "sludge metal",
    "progressive metal", "blackened death metal", "post-metal", "funeral doom", "grindcore",
    "melodic death metal", "technical death metal", "atmospheric black metal", "war metal",
    "drone metal", "stoner metal", "gothic metal", "power metal", "viking metal", "brutal death metal",
    "deathcore", "mathcore", "noise rock", "crust punk", "black doom", "death doom",
    "dissonant death metal", "ambient black metal", "depressive black metal", "crossover thrash",
 ]
 # Labels to search on Bandcamp — no time_range, broad results
 BANDCAMP_LABELS = [
    "sentient ruin",
    "20 buck spin",
    "prosthetic records",
    "unique leader records",
    "redefining darkness",
    "profound lore",
    "season of mist",
    "century media",
    "nuclear blast",
    "relapse records",
    "metal blade records",
    "dark descent records",
    "iron bonehead",
    "les acteurs de l'ombre",
    "vrasubatlat",
    "me saco un ojo",
    "hells headbangers",
    "invictus productions",
    "blood harvest",
    "chaos records",
    "adagio 830",
    "floga records",
    "sewer rot records",
    "lavadome productions",
    "memento mori",
 ]
 # Review/news sites — use time_range year to get fresh but not stale
 REVIEW_SITES = [
    "metalinjection.net",
    "cvltnation.com",
    "decibelmag.com",
    "invisibleoranges.com",
    "brooklynvegan.com",
    "heavyblogisheavy.com",
    "nocturnalcult.com",
    "themetalcrypt.com",
    "angrymetalguy.com",
    "sputnikmusic.com",
    "nocleansinging.com",
    "rateyourmusic.com",
    "terrorizer.com",
    "kerrang.com",
 ]
 def build_queries() -> list[str]:
    year = datetime.now().year
-    # rotate subgenre every 6 hours so successive runs hit different niches
+    # Rotate through 6 subgenres per run (changes every 4 hours)
-    idx = int(time.time() // 21600) % len(SUBGENRES)
+    idx = int(time.time() // 14400) % len(SUBGENRES)
-    sg = SUBGENRES[idx]
+    sgs = [SUBGENRES[(idx + i) % len(SUBGENRES)] for i in range(6)]
-    sg2 = SUBGENRES[(idx + 1) % len(SUBGENRES)]
+    # Rotate through label batches (every 6 hours, 5 labels at a time)
-    sg3 = SUBGENRES[(idx + 2) % len(SUBGENRES)]
+    label_idx = int(time.time() // 21600) % len(BANDCAMP_LABELS)
-    return [
+    labels = [BANDCAMP_LABELS[(label_idx + i) % len(BANDCAMP_LABELS)] for i in range(5)]
-        # Subgenre new releases
+    # Rotate review sites (every 12 hours, 4 sites at a time)
-        f'{sg} new album release {year}',
+    site_idx = int(time.time() // 43200) % len(REVIEW_SITES)
-        f'{sg2} new album {year}',
+    sites = [REVIEW_SITES[(site_idx + i) % len(REVIEW_SITES)] for i in range(4)]
-        f'{sg3} full album stream {year}',
+
-        # Bandcamp label searches
+    queries = []
-        f'site:bandcamp.com "sentient ruin" new {year}',
+
-        f'site:bandcamp.com "20 buck spin" new {year}',
+    # Subgenre releases — mix of year-scoped and unscoped for breadth
-        f'site:bandcamp.com "prosthetic records" {year}',
+    for sg in sgs[:3]:
-        f'site:bandcamp.com "unique leader records" {year}',
+        queries.append(f'{sg} new album release {year}')
-        f'site:bandcamp.com "redefining darkness" {year}',
+    for sg in sgs[3:]:
-        f'site:bandcamp.com "profound lore" {year}',
+        queries.append(f'{sg} new album bandcamp')
-        f'site:bandcamp.com "season of mist" {year}',
+
-        f'site:bandcamp.com "century media" {year}',
+    # Bandcamp direct — no time_range, catches more
-        f'site:bandcamp.com "nuclear blast" {year}',
+    for sg in sgs[:2]:
-        # Bandcamp genre pages
+        queries.append(f'site:bandcamp.com {sg} album {year}')
-        f'site:bandcamp.com {sg} album {year}',
+        queries.append(f'site:bandcamp.com {sg} new release')
-        f'site:bandcamp.com {sg2} album {year}',
+
-        # Metal reviews / coverage
+    # Label searches — broad, no time restriction
-        f'site:metal-archives.com new album review {year}',
+    for label in labels:
-        f'site:metalinjection.net new album {year}',
+        queries.append(f'site:bandcamp.com "{label}"')
-        f'site:cvltnation.com new album {year}',
+
-        f'site:decibelmag.com new album stream {year}',
+    # Review sites with year scope
-        f'site:invisibleoranges.com album review {year}',
+    for site in sites:
-        f'site:brooklynvegan.com {sg} new album {year}',
+        queries.append(f'site:{site} album review {year}')
-        # Obscure/underground
+        queries.append(f'site:{site} new album {year}')
-        f'underground {sg} demo release {year}',
+
-        f'independent {sg} album bandcamp {year}',
+    # Bandcamp underground / demo scene
-        f'new {sg} EP release {year}',
+    for sg in sgs[:2]:
-        # Michigan / regional bias
+        queries.append(f'underground {sg} demo {year} bandcamp')
-        f'Michigan metal band new album {year}',
+        queries.append(f'new {sg} EP {year} bandcamp')
-        f'Detroit metal band new release {year}',
+
-        # Best-of / roundup (useful for discovery)
+    # Michigan / regional
-        f'best {sg} albums {year}',
+    queries.append(f'Michigan metal band new album {year}')
-        f'new {sg} releases {year} bandcamp',
+    queries.append(f'Detroit metal band new release {year}')
-        # Last.fm tag pages
+    queries.append(f'Midwest metal new release {year} bandcamp')
-        f'site:last.fm {sg} new {year}',
+
-        # Full-album YouTube
+    # Metal Archives new reviews (no time_range — MA is evergreen)
-        f'"{sg}" "full album" {year} new band',
+    queries.append(f'site:metal-archives.com album review {year}')
-    ]
+    queries.append(f'site:metal-archives.com new band {year}')
    # Broad discovery
    queries.append(f'best underground metal albums {year}')
    queries.append(f'new metal releases {year} bandcamp full stream')
    queries.append(f'hidden gem metal album {year}')
    queries.append(f'obscure metal release {year}')
    queries.append(f'metal full album stream {year} new band')
    return queries
 # ── NOISE FILTERS ────────────────────────────────────────────────────────────
@ -255,16 +313,22 @@ async def search_candidates(client: httpx.AsyncClient) -> list[dict]:
    log.info(f"Running {len(queries)} SearXNG queries")
    async def one_query(q: str) -> list[dict]:
        # Use year time_range only for queries that include the current year literal;
        # skip time_range for Bandcamp label/genre queries to maximise breadth
        use_time_range = "site:bandcamp.com" not in q and "metal-archives" not in q
        params: dict = {"q": q, "format": "json"}
        if use_time_range:
            params["time_range"] = "year"
        try:
            r = await client.get(
                f"{settings.searxng_url}/search",
-                params={"q": q, "format": "json", "time_range": "month"},
+                params=params,
-                timeout=20,
+                timeout=25,
            )
            if r.status_code != 200:
                return []
            data = r.json()
-            results = data.get("results", [])[:15]  # top 15 per query
+            results = data.get("results", [])[:25]  # top 25 per query
            out = []
            for res in results:
                title = res.get("title", "")
@ -363,56 +427,41 @@ async def score_candidates(
    return scored
-async def prefilter_candidates(
+async def prefilter_batch(
-    client: httpx.AsyncClient, candidates: list[dict]
+    client: httpx.AsyncClient, batch: list[dict], batch_offset: int
 ) -> list[dict]:
-    """Use mistral-nemo to prefilter — keep actual album releases, drop noise."""
+    """Run prefilter on one batch. batch_offset adjusts index for provenance mapping."""
    if not candidates:
        return []
    # Send in batches of 40
    batch = candidates[:80]
    items_for_prompt = [
        {"index": i, "artist": c.get("artist", ""), "album": c.get("album", ""), "url": c.get("url", "")}
        for i, c in enumerate(batch)
    ]
    prompt = f"""You are a metal music expert. Review these {len(items_for_prompt)} items from web searches.
-Return ONLY a JSON array of items that are actual metal album or EP releases (not news articles, not Wikipedia, not lists).
+Return ONLY a JSON array of items that are actual metal album or EP releases (not news, not Wikipedia, not lists, not tour dates).
-For each item kept, include: index (integer), artist (string), album (string), subgenre (string), confidence (0.0-1.0).
+For each kept item include: index (integer), artist (string), album (string), subgenre (string), confidence (0.0-1.0).
 Be inclusive — keep anything that looks like it could be a real release even if you're not certain.
 Items:
 {json.dumps(items_for_prompt, indent=2)}
 Return ONLY valid JSON array starting with ["""
    body = {
        "model": "mistral-nemo:latest",
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
-        "options": {"temperature": 0.1, "num_predict": 4000},
+        "options": {"temperature": 0.1, "num_predict": 6000},
    }
    try:
-        r = await client.post(
+        r = await client.post(f"{settings.ollama_url}/api/chat", json=body, timeout=150)
            f"{settings.ollama_url}/api/chat",
            json=body,
            timeout=120,
        )
        raw = r.json().get("message", {}).get("content", "") if r.status_code == 200 else ""
    except Exception as e:
-        log.error(f"Prefilter failed: {e}")
+        log.warning(f"Prefilter batch failed: {e}")
-        return candidates[:40]  # fall through with top candidates
+        return batch  # pass through on error
    # Extract JSON from response
    parsed = extract_json_array(raw)
    if not parsed:
-        log.warning("Prefilter returned no parseable JSON — using all candidates")
+        log.warning(f"Prefilter batch {batch_offset}: no JSON — passing through")
-        return candidates[:40]
+        return batch
    log.info(f"Prefilter kept {len(parsed)}/{len(batch)} candidates")
    # Map back to full candidate objects with prefilter data
    enriched = []
    for p in parsed:
        idx = p.get("index")
@ -426,10 +475,35 @@ Return ONLY valid JSON array starting with ["""
            "confidence": p.get("confidence", 0.5),
        }
        enriched.append(orig)
    return enriched
 async def prefilter_candidates(
    client: httpx.AsyncClient, candidates: list[dict]
 ) -> list[dict]:
    """Run prefilter on all candidates in parallel batches of 35."""
    if not candidates:
        return []
    BATCH_SIZE = 35
    batches = [candidates[i:i+BATCH_SIZE] for i in range(0, min(len(candidates), 200), BATCH_SIZE)]
    log.info(f"Prefiltering {len(candidates)} candidates in {len(batches)} parallel batches")
    # Run up to 3 batches concurrently to avoid OOMing Ollama
    all_enriched = []
    for chunk_start in range(0, len(batches), 3):
        chunk = batches[chunk_start:chunk_start+3]
        results = await asyncio.gather(*[
            prefilter_batch(client, b, chunk_start * BATCH_SIZE + i * BATCH_SIZE)
            for i, b in enumerate(chunk)
        ])
        for r in results:
            all_enriched.extend(r)
    log.info(f"Prefilter kept {len(all_enriched)}/{len(candidates)} candidates")
    return all_enriched
 async def curate_picks(
    client: httpx.AsyncClient, candidates: list[dict], taste: dict
 ) -> list[dict]:
@ -437,7 +511,7 @@ async def curate_picks(
    if not candidates:
        return []
-    top = sorted(candidates, key=lambda x: -x.get("embedScore", 0))[:20]
+    top = sorted(candidates, key=lambda x: -x.get("embedScore", 0))[:30]
    top_artists = [a["name"] for a in taste.get("topArtists", [])[:20]]
    recent_artists = [a["name"] for a in taste.get("recentArtists", [])[:10]]
    year = datetime.now().year
@ -497,19 +571,19 @@ Return ONLY a JSON array starting with ["""
            raw = r.json().get("message", {}).get("content", "") if r.status_code == 200 else ""
            picks = extract_json_array(raw)
            if picks:
-                # Validate provenance: url must match a search result OR index must be valid
+                # Validate provenance: keep picks that reference a real input item
                validated = []
                for p in picks:
-                    if p.get("score", 0) < 60:
+                    if p.get("score", 0) < 50:
                        continue
                    idx = p.get("index")
                    url = p.get("url", "")
                    if idx is not None and idx in top_by_index:
-                        # Merge URL from original candidate if LLM dropped it
+                        # Always use original URL from search result
-                        if not url:
+                        p["url"] = top_by_index[idx].get("url", url)
                            p["url"] = top_by_index[idx].get("url", "")
                        validated.append(p)
-                    elif url and url in known_urls:
+                    elif url and any(url.startswith(ku[:40]) for ku in known_urls if ku):
                        # URL prefix match (handles trailing-slash variants)
                        validated.append(p)
                    else:
                        log.debug(f"Dropped hallucinated pick: {p.get('artist')} — {p.get('album')}")
@ -650,11 +724,11 @@ async def pipeline_run(run_id: str):
            log_step("Curating picks with Mistral")
            picks = await curate_picks(client, prefiltered, taste)
-            log_step(f"{len(picks)} picks scored ≥60")
+            log_step(f"{len(picks)} picks scored ≥50")
            if picks:
-                # Mark all prefiltered candidates as seen (not just picks) to avoid re-processing
+                # Mark only actual picks as seen — unselected candidates stay eligible for re-eval
-                await mark_seen(prefiltered)
+                await mark_seen(picks)
                await write_picks(picks, {
                    "candidates": len(candidates),
                    "filtered": len(prefiltered),