Fix pipeline yield: dedup, query expansion, parallel prefilter

- Dedup: mark only accepted picks as seen (not all prefiltered
  candidates) — unselected items stay eligible for re-evaluation,
  preventing pool exhaustion across runs
- Queries: expanded from 29 to 37+ with rotating 30-subgenre list,
  25 label targets, 14 review sites; Bandcamp/MA queries skip
  time_range for broader results; review sites use time_range:year
- Results per query: 15 → 25
- Prefilter: parallel batches of 35 (up to 3 concurrent), processes
  all fresh candidates instead of just top 80; be-inclusive prompt
- Curator: cap 20 → 30, score floor 60 → 50, URL prefix matching
  in provenance check instead of exact match

Result: 405 candidates/run vs 146 before; 88 passing prefilter vs 10;
pool stays at ~400 fresh on consecutive runs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
main
nick2day 3 months ago
parent 05bb4193ad
commit 39d6051a1f

@ -49,62 +49,120 @@ settings = Settings()
# ── SEARXNG QUERY BANK ─────────────────────────────────────────────────────── # ── SEARXNG QUERY BANK ───────────────────────────────────────────────────────
# All 29 queries — dynamic year, rotating subgenre index
SUBGENRES = [ SUBGENRES = [
"death metal", "black metal", "doom metal", "thrash metal", "sludge metal", "death metal", "black metal", "doom metal", "thrash metal", "sludge metal",
"progressive metal", "blackened death metal", "post-metal", "funeral doom", "grindcore", "progressive metal", "blackened death metal", "post-metal", "funeral doom", "grindcore",
"melodic death metal", "technical death metal", "atmospheric black metal", "war metal", "melodic death metal", "technical death metal", "atmospheric black metal", "war metal",
"drone metal", "stoner metal", "gothic metal", "power metal", "viking metal", "brutal death metal", "drone metal", "stoner metal", "gothic metal", "power metal", "viking metal", "brutal death metal",
"deathcore", "mathcore", "noise rock", "crust punk", "black doom", "death doom",
"dissonant death metal", "ambient black metal", "depressive black metal", "crossover thrash",
] ]
# Labels to search on Bandcamp — no time_range, broad results
BANDCAMP_LABELS = [
"sentient ruin",
"20 buck spin",
"prosthetic records",
"unique leader records",
"redefining darkness",
"profound lore",
"season of mist",
"century media",
"nuclear blast",
"relapse records",
"metal blade records",
"dark descent records",
"iron bonehead",
"les acteurs de l'ombre",
"vrasubatlat",
"me saco un ojo",
"hells headbangers",
"invictus productions",
"blood harvest",
"chaos records",
"adagio 830",
"floga records",
"sewer rot records",
"lavadome productions",
"memento mori",
]
# Review/news sites — use time_range year to get fresh but not stale
REVIEW_SITES = [
"metalinjection.net",
"cvltnation.com",
"decibelmag.com",
"invisibleoranges.com",
"brooklynvegan.com",
"heavyblogisheavy.com",
"nocturnalcult.com",
"themetalcrypt.com",
"angrymetalguy.com",
"sputnikmusic.com",
"nocleansinging.com",
"rateyourmusic.com",
"terrorizer.com",
"kerrang.com",
]
def build_queries() -> list[str]: def build_queries() -> list[str]:
year = datetime.now().year year = datetime.now().year
# rotate subgenre every 6 hours so successive runs hit different niches # Rotate through 6 subgenres per run (changes every 4 hours)
idx = int(time.time() // 21600) % len(SUBGENRES) idx = int(time.time() // 14400) % len(SUBGENRES)
sg = SUBGENRES[idx] sgs = [SUBGENRES[(idx + i) % len(SUBGENRES)] for i in range(6)]
sg2 = SUBGENRES[(idx + 1) % len(SUBGENRES)] # Rotate through label batches (every 6 hours, 5 labels at a time)
sg3 = SUBGENRES[(idx + 2) % len(SUBGENRES)] label_idx = int(time.time() // 21600) % len(BANDCAMP_LABELS)
return [ labels = [BANDCAMP_LABELS[(label_idx + i) % len(BANDCAMP_LABELS)] for i in range(5)]
# Subgenre new releases # Rotate review sites (every 12 hours, 4 sites at a time)
f'{sg} new album release {year}', site_idx = int(time.time() // 43200) % len(REVIEW_SITES)
f'{sg2} new album {year}', sites = [REVIEW_SITES[(site_idx + i) % len(REVIEW_SITES)] for i in range(4)]
f'{sg3} full album stream {year}',
# Bandcamp label searches queries = []
f'site:bandcamp.com "sentient ruin" new {year}',
f'site:bandcamp.com "20 buck spin" new {year}', # Subgenre releases — mix of year-scoped and unscoped for breadth
f'site:bandcamp.com "prosthetic records" {year}', for sg in sgs[:3]:
f'site:bandcamp.com "unique leader records" {year}', queries.append(f'{sg} new album release {year}')
f'site:bandcamp.com "redefining darkness" {year}', for sg in sgs[3:]:
f'site:bandcamp.com "profound lore" {year}', queries.append(f'{sg} new album bandcamp')
f'site:bandcamp.com "season of mist" {year}',
f'site:bandcamp.com "century media" {year}', # Bandcamp direct — no time_range, catches more
f'site:bandcamp.com "nuclear blast" {year}', for sg in sgs[:2]:
# Bandcamp genre pages queries.append(f'site:bandcamp.com {sg} album {year}')
f'site:bandcamp.com {sg} album {year}', queries.append(f'site:bandcamp.com {sg} new release')
f'site:bandcamp.com {sg2} album {year}',
# Metal reviews / coverage # Label searches — broad, no time restriction
f'site:metal-archives.com new album review {year}', for label in labels:
f'site:metalinjection.net new album {year}', queries.append(f'site:bandcamp.com "{label}"')
f'site:cvltnation.com new album {year}',
f'site:decibelmag.com new album stream {year}', # Review sites with year scope
f'site:invisibleoranges.com album review {year}', for site in sites:
f'site:brooklynvegan.com {sg} new album {year}', queries.append(f'site:{site} album review {year}')
# Obscure/underground queries.append(f'site:{site} new album {year}')
f'underground {sg} demo release {year}',
f'independent {sg} album bandcamp {year}', # Bandcamp underground / demo scene
f'new {sg} EP release {year}', for sg in sgs[:2]:
# Michigan / regional bias queries.append(f'underground {sg} demo {year} bandcamp')
f'Michigan metal band new album {year}', queries.append(f'new {sg} EP {year} bandcamp')
f'Detroit metal band new release {year}',
# Best-of / roundup (useful for discovery) # Michigan / regional
f'best {sg} albums {year}', queries.append(f'Michigan metal band new album {year}')
f'new {sg} releases {year} bandcamp', queries.append(f'Detroit metal band new release {year}')
# Last.fm tag pages queries.append(f'Midwest metal new release {year} bandcamp')
f'site:last.fm {sg} new {year}',
# Full-album YouTube # Metal Archives new reviews (no time_range — MA is evergreen)
f'"{sg}" "full album" {year} new band', queries.append(f'site:metal-archives.com album review {year}')
] queries.append(f'site:metal-archives.com new band {year}')
# Broad discovery
queries.append(f'best underground metal albums {year}')
queries.append(f'new metal releases {year} bandcamp full stream')
queries.append(f'hidden gem metal album {year}')
queries.append(f'obscure metal release {year}')
queries.append(f'metal full album stream {year} new band')
return queries
# ── NOISE FILTERS ──────────────────────────────────────────────────────────── # ── NOISE FILTERS ────────────────────────────────────────────────────────────
@ -255,16 +313,22 @@ async def search_candidates(client: httpx.AsyncClient) -> list[dict]:
log.info(f"Running {len(queries)} SearXNG queries") log.info(f"Running {len(queries)} SearXNG queries")
async def one_query(q: str) -> list[dict]: async def one_query(q: str) -> list[dict]:
# Use year time_range only for queries that include the current year literal;
# skip time_range for Bandcamp label/genre queries to maximise breadth
use_time_range = "site:bandcamp.com" not in q and "metal-archives" not in q
params: dict = {"q": q, "format": "json"}
if use_time_range:
params["time_range"] = "year"
try: try:
r = await client.get( r = await client.get(
f"{settings.searxng_url}/search", f"{settings.searxng_url}/search",
params={"q": q, "format": "json", "time_range": "month"}, params=params,
timeout=20, timeout=25,
) )
if r.status_code != 200: if r.status_code != 200:
return [] return []
data = r.json() data = r.json()
results = data.get("results", [])[:15] # top 15 per query results = data.get("results", [])[:25] # top 25 per query
out = [] out = []
for res in results: for res in results:
title = res.get("title", "") title = res.get("title", "")
@ -363,56 +427,41 @@ async def score_candidates(
return scored return scored
async def prefilter_candidates( async def prefilter_batch(
client: httpx.AsyncClient, candidates: list[dict] client: httpx.AsyncClient, batch: list[dict], batch_offset: int
) -> list[dict]: ) -> list[dict]:
"""Use mistral-nemo to prefilter — keep actual album releases, drop noise.""" """Run prefilter on one batch. batch_offset adjusts index for provenance mapping."""
if not candidates:
return []
# Send in batches of 40
batch = candidates[:80]
items_for_prompt = [ items_for_prompt = [
{"index": i, "artist": c.get("artist", ""), "album": c.get("album", ""), "url": c.get("url", "")} {"index": i, "artist": c.get("artist", ""), "album": c.get("album", ""), "url": c.get("url", "")}
for i, c in enumerate(batch) for i, c in enumerate(batch)
] ]
prompt = f"""You are a metal music expert. Review these {len(items_for_prompt)} items from web searches. prompt = f"""You are a metal music expert. Review these {len(items_for_prompt)} items from web searches.
Return ONLY a JSON array of items that are actual metal album or EP releases (not news articles, not Wikipedia, not lists). Return ONLY a JSON array of items that are actual metal album or EP releases (not news, not Wikipedia, not lists, not tour dates).
For each item kept, include: index (integer), artist (string), album (string), subgenre (string), confidence (0.0-1.0). For each kept item include: index (integer), artist (string), album (string), subgenre (string), confidence (0.0-1.0).
Be inclusive keep anything that looks like it could be a real release even if you're not certain.
Items: Items:
{json.dumps(items_for_prompt, indent=2)} {json.dumps(items_for_prompt, indent=2)}
Return ONLY valid JSON array starting with [""" Return ONLY valid JSON array starting with ["""
body = { body = {
"model": "mistral-nemo:latest", "model": "mistral-nemo:latest",
"messages": [{"role": "user", "content": prompt}], "messages": [{"role": "user", "content": prompt}],
"stream": False, "stream": False,
"options": {"temperature": 0.1, "num_predict": 4000}, "options": {"temperature": 0.1, "num_predict": 6000},
} }
try: try:
r = await client.post( r = await client.post(f"{settings.ollama_url}/api/chat", json=body, timeout=150)
f"{settings.ollama_url}/api/chat",
json=body,
timeout=120,
)
raw = r.json().get("message", {}).get("content", "") if r.status_code == 200 else "" raw = r.json().get("message", {}).get("content", "") if r.status_code == 200 else ""
except Exception as e: except Exception as e:
log.error(f"Prefilter failed: {e}") log.warning(f"Prefilter batch failed: {e}")
return candidates[:40] # fall through with top candidates return batch # pass through on error
# Extract JSON from response
parsed = extract_json_array(raw) parsed = extract_json_array(raw)
if not parsed: if not parsed:
log.warning("Prefilter returned no parseable JSON — using all candidates") log.warning(f"Prefilter batch {batch_offset}: no JSON — passing through")
return candidates[:40] return batch
log.info(f"Prefilter kept {len(parsed)}/{len(batch)} candidates")
# Map back to full candidate objects with prefilter data
enriched = [] enriched = []
for p in parsed: for p in parsed:
idx = p.get("index") idx = p.get("index")
@ -426,10 +475,35 @@ Return ONLY valid JSON array starting with ["""
"confidence": p.get("confidence", 0.5), "confidence": p.get("confidence", 0.5),
} }
enriched.append(orig) enriched.append(orig)
return enriched return enriched
async def prefilter_candidates(
client: httpx.AsyncClient, candidates: list[dict]
) -> list[dict]:
"""Run prefilter on all candidates in parallel batches of 35."""
if not candidates:
return []
BATCH_SIZE = 35
batches = [candidates[i:i+BATCH_SIZE] for i in range(0, min(len(candidates), 200), BATCH_SIZE)]
log.info(f"Prefiltering {len(candidates)} candidates in {len(batches)} parallel batches")
# Run up to 3 batches concurrently to avoid OOMing Ollama
all_enriched = []
for chunk_start in range(0, len(batches), 3):
chunk = batches[chunk_start:chunk_start+3]
results = await asyncio.gather(*[
prefilter_batch(client, b, chunk_start * BATCH_SIZE + i * BATCH_SIZE)
for i, b in enumerate(chunk)
])
for r in results:
all_enriched.extend(r)
log.info(f"Prefilter kept {len(all_enriched)}/{len(candidates)} candidates")
return all_enriched
async def curate_picks( async def curate_picks(
client: httpx.AsyncClient, candidates: list[dict], taste: dict client: httpx.AsyncClient, candidates: list[dict], taste: dict
) -> list[dict]: ) -> list[dict]:
@ -437,7 +511,7 @@ async def curate_picks(
if not candidates: if not candidates:
return [] return []
top = sorted(candidates, key=lambda x: -x.get("embedScore", 0))[:20] top = sorted(candidates, key=lambda x: -x.get("embedScore", 0))[:30]
top_artists = [a["name"] for a in taste.get("topArtists", [])[:20]] top_artists = [a["name"] for a in taste.get("topArtists", [])[:20]]
recent_artists = [a["name"] for a in taste.get("recentArtists", [])[:10]] recent_artists = [a["name"] for a in taste.get("recentArtists", [])[:10]]
year = datetime.now().year year = datetime.now().year
@ -497,19 +571,19 @@ Return ONLY a JSON array starting with ["""
raw = r.json().get("message", {}).get("content", "") if r.status_code == 200 else "" raw = r.json().get("message", {}).get("content", "") if r.status_code == 200 else ""
picks = extract_json_array(raw) picks = extract_json_array(raw)
if picks: if picks:
# Validate provenance: url must match a search result OR index must be valid # Validate provenance: keep picks that reference a real input item
validated = [] validated = []
for p in picks: for p in picks:
if p.get("score", 0) < 60: if p.get("score", 0) < 50:
continue continue
idx = p.get("index") idx = p.get("index")
url = p.get("url", "") url = p.get("url", "")
if idx is not None and idx in top_by_index: if idx is not None and idx in top_by_index:
# Merge URL from original candidate if LLM dropped it # Always use original URL from search result
if not url: p["url"] = top_by_index[idx].get("url", url)
p["url"] = top_by_index[idx].get("url", "")
validated.append(p) validated.append(p)
elif url and url in known_urls: elif url and any(url.startswith(ku[:40]) for ku in known_urls if ku):
# URL prefix match (handles trailing-slash variants)
validated.append(p) validated.append(p)
else: else:
log.debug(f"Dropped hallucinated pick: {p.get('artist')}{p.get('album')}") log.debug(f"Dropped hallucinated pick: {p.get('artist')}{p.get('album')}")
@ -650,11 +724,11 @@ async def pipeline_run(run_id: str):
log_step("Curating picks with Mistral") log_step("Curating picks with Mistral")
picks = await curate_picks(client, prefiltered, taste) picks = await curate_picks(client, prefiltered, taste)
log_step(f"{len(picks)} picks scored ≥60") log_step(f"{len(picks)} picks scored ≥50")
if picks: if picks:
# Mark all prefiltered candidates as seen (not just picks) to avoid re-processing # Mark only actual picks as seen — unselected candidates stay eligible for re-eval
await mark_seen(prefiltered) await mark_seen(picks)
await write_picks(picks, { await write_picks(picks, {
"candidates": len(candidates), "candidates": len(candidates),
"filtered": len(prefiltered), "filtered": len(prefiltered),

Loading…
Cancel
Save