Initial commit: FGS metal discovery standalone agent

FastAPI service replacing the 77-node n8n pipeline. Implements full
discovery pipeline: 29 rotating SearXNG queries, nomic-embed-text
scoring against Last.fm taste centroid, Mistral-nemo prefilter and
curator with provenance validation, SQLite dedup, writes to
metal-picks.json for the existing FGS dashboard.

Runs as systemd service on port 8766 (fgs-agent.home via Caddy).
n8n reduced to a 2-node schedule trigger → HTTP POST.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
main
nick2day 3 months ago
commit 05bb4193ad

5
.gitignore vendored

@ -0,0 +1,5 @@
venv/
__pycache__/
*.pyc
.env
dedup.db

@ -0,0 +1,721 @@
"""
fgs-agent — First Gain Stage metal discovery pipeline.

Standalone FastAPI service; n8n just POSTs to /api/fgs/run on a schedule.

Endpoints:
    POST /api/fgs/run     trigger a discovery run (async, returns run_id)
    GET  /api/fgs/status  current run status
    GET  /api/fgs/picks   read metal-picks.json
    GET  /api/fgs/health  liveness check
"""
import asyncio
import json
import logging
import math
import re
import time
import uuid
from datetime import datetime, timezone
from typing import Any, Optional
import aiosqlite
import httpx
import uvicorn
from fastapi import BackgroundTasks, FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from pydantic_settings import BaseSettings, SettingsConfigDict
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger("fgs-agent")
# ── Config ──────────────────────────────────────────────────────────────────
class Settings(BaseSettings):
    """Service configuration, read via pydantic-settings with ``.env`` as the env file."""

    # Load values from a UTF-8 encoded ".env" file when one is present.
    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")
    # Last.fm account sent as the "user" parameter of the Last.fm requests.
    lastfm_username: str = "nick2day"
    # Last.fm API key sent as "api_key"; empty by default.
    lastfm_api_key: str = ""
    # Base URL of the SearXNG instance; "/search" is appended when querying.
    searxng_url: str = "http://localhost:8080"
    # Base URL of the Ollama server used for embeddings and chat completions.
    ollama_url: str = "http://localhost:11434"
    # JSON file that picks are appended to and that the picks endpoint reads.
    picks_path: str = "/home/nick2day/.n8n/metal-picks.json"
    # SQLite database file holding the "seen" dedup table.
    dedup_db: str = "/home/nick2day/fgs-agent/dedup.db"
    # TCP port handed to uvicorn in the script entry point.
    port: int = 8766


# Module-wide settings instance, built once at import time.
settings = Settings()
# ── SEARXNG QUERY BANK ───────────────────────────────────────────────────────
# All 29 queries — dynamic year, rotating subgenre index
SUBGENRES = [
    "death metal",
    "black metal",
    "doom metal",
    "thrash metal",
    "sludge metal",
    "progressive metal",
    "blackened death metal",
    "post-metal",
    "funeral doom",
    "grindcore",
    "melodic death metal",
    "technical death metal",
    "atmospheric black metal",
    "war metal",
    "drone metal",
    "stoner metal",
    "gothic metal",
    "power metal",
    "viking metal",
    "brutal death metal",
]


def build_queries() -> list[str]:
    """Assemble the 29 SearXNG search queries for the current rotation slot.

    The current calendar year is embedded in every query. Three consecutive
    entries of ``SUBGENRES`` are selected by a slot index derived from the
    wall clock, so the selection advances by one every 6 hours (21600 s).

    Returns:
        The query strings, in a fixed order.
    """
    year = datetime.now().year
    slot = int(time.time() // 21600) % len(SUBGENRES)
    primary, second, third = (
        SUBGENRES[(slot + step) % len(SUBGENRES)] for step in range(3)
    )

    # Subgenre new releases
    queries = [
        f'{primary} new album release {year}',
        f'{second} new album {year}',
        f'{third} full album stream {year}',
    ]

    # Bandcamp label searches (the first two labels carry an extra "new")
    queries += [
        f'site:bandcamp.com "{label}" new {year}'
        for label in ("sentient ruin", "20 buck spin")
    ]
    queries += [
        f'site:bandcamp.com "{label}" {year}'
        for label in (
            "prosthetic records",
            "unique leader records",
            "redefining darkness",
            "profound lore",
            "season of mist",
            "century media",
            "nuclear blast",
        )
    ]

    # Bandcamp genre pages
    queries += [f'site:bandcamp.com {genre} album {year}' for genre in (primary, second)]

    # Metal reviews / coverage
    queries += [
        f'site:metal-archives.com new album review {year}',
        f'site:metalinjection.net new album {year}',
        f'site:cvltnation.com new album {year}',
        f'site:decibelmag.com new album stream {year}',
        f'site:invisibleoranges.com album review {year}',
        f'site:brooklynvegan.com {primary} new album {year}',
    ]

    # Obscure/underground
    queries += [
        f'underground {primary} demo release {year}',
        f'independent {primary} album bandcamp {year}',
        f'new {primary} EP release {year}',
    ]

    # Michigan / regional bias
    queries += [
        f'Michigan metal band new album {year}',
        f'Detroit metal band new release {year}',
    ]

    # Best-of / roundup (useful for discovery)
    queries += [
        f'best {primary} albums {year}',
        f'new {primary} releases {year} bandcamp',
    ]

    # Last.fm tag pages
    queries.append(f'site:last.fm {primary} new {year}')

    # Full-album YouTube
    queries.append(f'"{primary}" "full album" {year} new band')

    return queries
# ── NOISE FILTERS ────────────────────────────────────────────────────────────
# Case-insensitive patterns marking a search hit as noise (encyclopedia pages,
# listicles, announcements, soundtracks, ...) rather than an actual release.
NOISE_PATTERNS = [
    re.compile(expression, re.I)
    for expression in (
        r'wikipedia\.org',
        r'\bin heavy metal music\b',
        r'album release calendar',
        r'most anticipated',
        r'\byear in review\b',
        r'^top \d+',
        r'albums you need to hear',
        r'upcoming.*releases',
        r'\bto release\b.*album',
        r'\bannounce[sd]?\b.*album',
        r'\bdelayed\b',
        r'best.*releases',
        r'\d+ metal albums.*20\d\d',
        r'shatner',
        r'mark your calendars',
        r'\bvideo game\b',
        r'\bsoundtrack\b',
        r'\btribute\b',
        r'\bcovers album\b',
        r'heavy metal music - wikipedia',
        r'in \d{4} - wikipedia',
    )
]


def is_noise(title: str, url: str) -> bool:
    """Return True when the title/URL pair matches any entry of ``NOISE_PATTERNS``.

    Title and URL are joined with a single space and searched as one string,
    so a pattern anchored with ``^`` applies to the start of the title.
    """
    haystack = f"{title} {url}"
    for pattern in NOISE_PATTERNS:
        if pattern.search(haystack):
            return True
    return False
# "Album Title, by Artist Name" — Bandcamp page-title format.
_BANDCAMP_TITLE_RE = re.compile(r'^(.+?),\s+by\s+(.+)$', re.I)
# "Artist - Album" with whitespace on both sides of a hyphen, en dash or em dash.
_SPACED_DASH_TITLE_RE = re.compile(r'^(.+?)\s+[-\u2013\u2014]\s+(.+)$')
# Fallback: split at the first dash of any kind, with or without whitespace.
_ANY_DASH_TITLE_RE = re.compile(r'^([^-\u2013\u2014]+?)\s*[-\u2013\u2014]\s*(.+)$')


def parse_title(title: str) -> tuple[str, str]:
    """Extract ``(artist, album)`` from common search-result title formats.

    Formats are tried in this order:

    1. ``"Album, by Artist"`` (Bandcamp).
    2. ``"Artist - Album"`` where the dash is surrounded by whitespace. Trying
       this first keeps hyphenated artist names such as ``"Alt-J"`` intact.
    3. ``"Artist-Album"`` split at the first dash.

    A hyphen, en dash (U+2013) and em dash (U+2014) are all accepted as the
    separator. A dash split is only accepted when the artist part is shorter
    than 60 characters, which rejects long headlines that happen to contain
    a dash.

    Args:
        title: Raw page title from a search result.

    Returns:
        ``(artist, album)``; the artist is ``''`` and the album is the
        stripped title when no format matches.
    """
    bandcamp = _BANDCAMP_TITLE_RE.match(title)
    if bandcamp:
        return bandcamp.group(2).strip(), bandcamp.group(1).strip()

    for pattern in (_SPACED_DASH_TITLE_RE, _ANY_DASH_TITLE_RE):
        split = pattern.match(title)
        if split and len(split.group(1)) < 60:
            return split.group(1).strip(), split.group(2).strip()

    return '', title.strip()
def artist_from_url(url: str) -> str:
    """Derive an artist name from a ``<slug>.bandcamp.com`` URL.

    The subdomain is used as the name, with hyphens turned into spaces.
    Returns ``''`` for URLs that do not have that shape.
    """
    hit = re.match(r'https?://([^.]+)\.bandcamp\.com', url, re.I)
    if hit is None:
        return ''
    slug = hit.group(1)
    return slug.replace('-', ' ')
# ── DEDUP ────────────────────────────────────────────────────────────────────
async def init_dedup_db():
    """Create the ``seen`` table in the dedup database if it is missing.

    The table maps a text ``key`` (primary key) to an integer ``ts``.
    """
    create_table = "CREATE TABLE IF NOT EXISTS seen (key TEXT PRIMARY KEY, ts INTEGER)"
    async with aiosqlite.connect(settings.dedup_db) as conn:
        await conn.execute(create_table)
        await conn.commit()
async def filter_unseen(candidates: list[dict]) -> list[dict]:
"""Remove already-seen albums. Returns only fresh candidates."""
async with aiosqlite.connect(settings.dedup_db) as db:
fresh = []
for c in candidates:
key = f"{c.get('artist','').lower()}::{c.get('album','').lower()}"
if not key or key == '::':
key = c.get('url', c.get('album', ''))
row = await db.execute("SELECT 1 FROM seen WHERE key=?", (key,))
if not await row.fetchone():
fresh.append(c)
return fresh
async def mark_seen(picks: list[dict]):
async with aiosqlite.connect(settings.dedup_db) as db:
ts = int(time.time())
for p in picks:
key = f"{p.get('artist','').lower()}::{p.get('album','').lower()}"
if key and key != '::':
await db.execute(
"INSERT OR IGNORE INTO seen (key, ts) VALUES (?, ?)", (key, ts)
)
await db.commit()
# ── PIPELINE STAGES ──────────────────────────────────────────────────────────
async def fetch_lastfm(client: httpx.AsyncClient) -> dict:
    """Fetch the user's top artists and recent tracks from Last.fm.

    Both requests are issued concurrently with a 15 second timeout each.

    Returns:
        ``{"top": ..., "recent": ...}`` holding the decoded JSON body of each
        response, or ``{}`` for a response whose status is not 200.
    """
    endpoint = "https://ws.audioscrobbler.com/2.0/"
    top_query = {
        "method": "user.gettopartists",
        "user": settings.lastfm_username,
        "api_key": settings.lastfm_api_key,
        "period": "12month",
        "limit": "100",
        "format": "json",
    }
    # NOTE(review): Last.fm may cap the page size below 1000 — confirm against the API docs.
    recent_query = {
        "method": "user.getrecenttracks",
        "user": settings.lastfm_username,
        "api_key": settings.lastfm_api_key,
        "limit": "1000",
        "format": "json",
    }

    def body_or_empty(response) -> dict:
        # Anything other than HTTP 200 is treated as "no data".
        if response.status_code == 200:
            return response.json()
        return {}

    top_response, recent_response = await asyncio.gather(
        client.get(endpoint, params=top_query, timeout=15),
        client.get(endpoint, params=recent_query, timeout=15),
    )
    return {
        "top": body_or_empty(top_response),
        "recent": body_or_empty(recent_response),
    }
def _lastfm_items(value) -> list:
    """Normalise a Last.fm collection field to a list.

    A lone object is wrapped in a list, and anything that is neither a list
    nor an object becomes an empty list.
    NOTE(review): Last.fm is understood to return a bare object instead of a
    one-element array in some responses — confirm against live data.
    """
    if isinstance(value, dict):
        return [value]
    return value if isinstance(value, list) else []


def _int_or(value, default: int) -> int:
    """Convert *value* to ``int``, returning *default* when that is not possible."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def build_taste_profile(lastfm: dict) -> dict:
    """Build the taste profile from the data returned by ``fetch_lastfm``.

    Args:
        lastfm: Mapping with optional ``"top"`` and ``"recent"`` entries
            holding the decoded Last.fm responses.

    Returns:
        ``{"topArtists": [...], "recentArtists": [...]}`` where

        * ``topArtists`` has up to 100 ``{"name", "playcount", "rank"}``
          entries; the rank defaults to 99 when absent or unparsable, the
          play count to 0.
        * ``recentArtists`` has the 20 artists with the most recent tracks as
          ``{"name", "plays"}``, most played first.

        Malformed entries (non-objects, entries without a name) are skipped
        rather than raising.
    """
    top_section = (lastfm.get("top") or {}).get("topartists") or {}
    recent_section = (lastfm.get("recent") or {}).get("recenttracks") or {}
    top_artists_raw = _lastfm_items(top_section.get("artist", []))
    recent_tracks_raw = _lastfm_items(recent_section.get("track", []))

    top_artists = [
        {
            "name": a["name"],
            "playcount": _int_or(a.get("playcount", 0), 0),
            "rank": _int_or((a.get("@attr") or {}).get("rank", 99), 99),
        }
        for a in top_artists_raw[:100]
        if isinstance(a, dict) and a.get("name")
    ]

    recent_map: dict[str, int] = {}
    for t in recent_tracks_raw:
        artist = t.get("artist") if isinstance(t, dict) else None
        name = artist.get("#text", "") if isinstance(artist, dict) else ""
        if name:
            recent_map[name] = recent_map.get(name, 0) + 1
    # Stable sort: artists with equal counts keep first-seen order.
    recent_artists = sorted(recent_map.items(), key=lambda x: -x[1])[:20]

    return {
        "topArtists": top_artists,
        "recentArtists": [{"name": n, "plays": p} for n, p in recent_artists],
    }
async def search_candidates(client: httpx.AsyncClient) -> list[dict]:
    """Run all SearXNG queries concurrently, collect and deduplicate results.

    Each query asks for JSON results from the last month and keeps the top 15
    hits. Hits flagged by ``is_noise`` are dropped; the rest are turned into
    candidate dicts with ``title``, ``url``, ``artist``, ``album``, ``source``
    (first 60 characters of the query) and ``content`` (first 200 characters
    of the snippet).

    A query that fails for any reason is logged and contributes no results;
    it never aborts the other queries.

    Returns:
        Candidates with distinct, non-empty URLs, in first-seen order.
    """
    queries = build_queries()
    log.info(f"Running {len(queries)} SearXNG queries")

    async def one_query(q: str) -> list[dict]:
        try:
            r = await client.get(
                f"{settings.searxng_url}/search",
                params={"q": q, "format": "json", "time_range": "month"},
                timeout=20,
            )
            if r.status_code != 200:
                return []
            data = r.json()
            results = (data.get("results") or [])[:15]  # top 15 per query
            out = []
            for res in results:
                # JSON null would break the regex helpers and, through the
                # except below, discard every result of this query.
                title = res.get("title") or ""
                url = res.get("url") or ""
                if is_noise(title, url):
                    continue
                artist, album = parse_title(title)
                if not artist:
                    artist = artist_from_url(url)
                out.append({
                    "title": title,
                    "url": url,
                    "artist": artist,
                    "album": album,
                    "source": q[:60],
                    "content": (res.get("content") or "")[:200],
                })
            return out
        except Exception as e:
            # Deliberate best-effort boundary: one bad query must not sink the run.
            log.warning(f"SearXNG query failed: {q[:40]}: {e}")
            return []

    results = await asyncio.gather(*[one_query(q) for q in queries])

    # Deduplicate by URL, keeping the first occurrence.
    seen_urls: set[str] = set()
    unique = []
    for batch in results:
        for r in batch:
            u = r["url"]
            if u and u not in seen_urls:
                seen_urls.add(u)
                unique.append(r)
    log.info(f"Collected {len(unique)} unique candidates from search")
    return unique
async def embed_text(client: httpx.AsyncClient, text: str) -> list[float]:
    """Request a ``nomic-embed-text`` embedding for *text* from Ollama.

    Returns:
        The ``embedding`` list from the response, or ``[]`` when the request
        raises, the status is not 200, or the field is missing.
    """
    endpoint = f"{settings.ollama_url}/api/embeddings"
    payload = {"model": "nomic-embed-text", "prompt": text}
    try:
        response = await client.post(endpoint, json=payload, timeout=30)
        if response.status_code == 200:
            return response.json().get("embedding", [])
    except Exception as e:
        log.warning(f"Embed failed: {e}")
    return []
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of two equal-length vectors.

    Returns ``0.0`` when either vector is empty, when the lengths differ, or
    when either vector has zero magnitude.
    """
    comparable = bool(a) and bool(b) and len(a) == len(b)
    if not comparable:
        return 0.0

    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    if not norm_a or not norm_b:
        return 0.0

    inner = sum(x * y for x, y in zip(a, b))
    return inner / (norm_a * norm_b)
async def score_candidates(
    client: httpx.AsyncClient, candidates: list[dict], taste: dict
) -> list[dict]:
    """Embed candidates and score them against the taste centroid.

    The centroid is the embedding of a single sentence listing the first 30
    top artists. Every candidate gets an ``embedScore`` key (cosine similarity
    rounded to 4 places, or ``0.0`` when its embedding is unavailable); the
    candidate dicts are modified in place.

    Returns:
        ``[]`` for no candidates; the input list itself, all scored ``0.0``,
        when the centroid cannot be embedded; otherwise a new list sorted by
        descending ``embedScore``.
    """
    if not candidates:
        return []

    reference_names = [entry["name"] for entry in taste.get("topArtists", [])[:30]]
    log.info("Embedding taste centroid")
    centroid_vec = await embed_text(
        client, "Heavy metal artist: " + ", ".join(reference_names)
    )
    if not centroid_vec:
        log.warning("Centroid embedding failed — scoring by 0")
        for item in candidates:
            item["embedScore"] = 0.0
        return candidates

    total = len(candidates)
    log.info(f"Scoring {total} candidates against taste centroid")
    # Embeddings are requested one at a time to avoid hammering Ollama.
    for position, item in enumerate(candidates):
        if item.get("artist"):
            label = f"Metal album: {item['artist']} - {item['album']}"
        else:
            label = f"Metal album: {item['album']}"
        vector = await embed_text(client, label)
        if vector:
            item["embedScore"] = round(cosine_similarity(centroid_vec, vector), 4)
        else:
            item["embedScore"] = 0.0
        if position % 20 == 0:
            log.info(f" scored {position}/{total}")

    return sorted(candidates, key=lambda item: -item.get("embedScore", 0))
def _prefilter_index_ok(value, size: int) -> bool:
    """Return True when *value* is a genuine integer index into *size* items."""
    return isinstance(value, int) and not isinstance(value, bool) and 0 <= value < size


async def prefilter_candidates(
    client: httpx.AsyncClient, candidates: list[dict]
) -> list[dict]:
    """Use mistral-nemo to prefilter — keep actual album releases, drop noise.

    The first 80 candidates are sent to the model in a single request. Each
    entry of the model's answer is matched back to its candidate by ``index``;
    entries that are not objects, carry a missing, non-integer, negative or
    out-of-range index, or repeat an index already used are ignored, so
    malformed model output cannot crash the run or select the wrong candidate.

    Returns:
        Copies of the kept candidates, with ``artist``/``album`` replaced by
        the model's non-empty string values and a ``prefilter`` dict
        (``subgenre``, ``confidence``) added. When the request fails or the
        answer contains no parseable JSON array, the first 40 input candidates
        are returned unchanged.
    """
    if not candidates:
        return []
    # Only the first 80 candidates are reviewed, all in one request.
    batch = candidates[:80]
    items_for_prompt = [
        {"index": i, "artist": c.get("artist", ""), "album": c.get("album", ""), "url": c.get("url", "")}
        for i, c in enumerate(batch)
    ]
    prompt = f"""You are a metal music expert. Review these {len(items_for_prompt)} items from web searches.
Return ONLY a JSON array of items that are actual metal album or EP releases (not news articles, not Wikipedia, not lists).
For each item kept, include: index (integer), artist (string), album (string), subgenre (string), confidence (0.0-1.0).
Items:
{json.dumps(items_for_prompt, indent=2)}
Return ONLY valid JSON array starting with ["""
    body = {
        "model": "mistral-nemo:latest",
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 4000},
    }
    try:
        r = await client.post(
            f"{settings.ollama_url}/api/chat",
            json=body,
            timeout=120,
        )
        raw = r.json().get("message", {}).get("content", "") if r.status_code == 200 else ""
    except Exception as e:
        log.error(f"Prefilter failed: {e}")
        return candidates[:40]  # fall through with top candidates

    # Extract JSON from response
    parsed = extract_json_array(raw)
    if not parsed:
        log.warning("Prefilter returned no parseable JSON — using all candidates")
        return candidates[:40]

    # Map back to full candidate objects with prefilter data
    enriched = []
    used_indices: set[int] = set()
    for p in parsed:
        if not isinstance(p, dict):
            continue
        idx = p.get("index")
        if not _prefilter_index_ok(idx, len(batch)) or idx in used_indices:
            continue
        used_indices.add(idx)
        orig = batch[idx].copy()
        for field in ("artist", "album"):
            value = p.get(field)
            # Only a non-empty string may replace the parsed value.
            if isinstance(value, str) and value:
                orig[field] = value
            else:
                orig[field] = orig.get(field, "")
        orig["prefilter"] = {
            "subgenre": p.get("subgenre", "unknown"),
            "confidence": p.get("confidence", 0.5),
        }
        enriched.append(orig)
    log.info(f"Prefilter kept {len(enriched)}/{len(batch)} candidates")
    return enriched
def _validate_picks(picks: list, top: list[dict]) -> list[dict]:
    """Keep only well-formed picks scoring at least 60 that trace back to *top*.

    A pick is accepted when its ``index`` addresses an entry of *top*, or
    failing that when its ``url`` equals the URL of one of those entries.
    For index-matched picks whose ``url`` is missing or not a known search
    result, the URL of the indexed candidate is substituted, so every
    accepted pick ends up with a URL that came from the search results.
    """
    known_urls = {c.get("url", "") for c in top if c.get("url")}
    validated = []
    for p in picks:
        if not isinstance(p, dict):
            continue
        try:
            score = float(p.get("score", 0))
        except (TypeError, ValueError):
            continue
        if not score >= 60:
            continue
        idx = p.get("index")
        if isinstance(idx, float) and idx.is_integer():
            idx = int(idx)
        url = p.get("url", "")
        url_known = isinstance(url, str) and url in known_urls
        if isinstance(idx, int) and not isinstance(idx, bool) and 0 <= idx < len(top):
            if not url_known:
                p["url"] = top[idx].get("url", "")
            validated.append(p)
        elif url_known:
            validated.append(p)
        else:
            log.debug(f"Dropped hallucinated pick: {p.get('artist')} - {p.get('album')}")
    return validated


async def curate_picks(
    client: httpx.AsyncClient, candidates: list[dict], taste: dict
) -> list[dict]:
    """Use mistral-nemo to produce final scored picks with full metadata.

    The 20 candidates with the highest ``embedScore`` are presented to the
    model together with the taste profile. The request is attempted up to
    three times, pausing 2 seconds between attempts, until at least one pick
    passes validation.

    Returns:
        The validated picks of the first successful attempt, or ``[]`` when
        there are no candidates or every attempt fails.
    """
    if not candidates:
        return []
    top = sorted(candidates, key=lambda x: -x.get("embedScore", 0))[:20]
    top_artists = [a["name"] for a in taste.get("topArtists", [])[:20]]
    recent_artists = [a["name"] for a in taste.get("recentArtists", [])[:10]]
    year = datetime.now().year
    items_for_prompt = [
        {
            "index": i,
            "artist": c.get("artist", ""),
            "album": c.get("album", ""),
            "subgenre": (c.get("prefilter") or {}).get("subgenre", "unknown"),
            "url": c.get("url", ""),
            "source": c.get("source", ""),
        }
        for i, c in enumerate(top)
    ]
    prompt = f"""You are a heavy metal curator reviewing NEW {year} releases found via web search.
IMPORTANT: Only score albums from the list below. Do NOT recommend albums not in this list.
Do NOT recommend classic albums. These must be actual {year} releases found at the provided URLs.
If you are not sure an item is a real new release, skip it.
nick2day's taste profile:
- Top artists (12-month): {', '.join(top_artists)}
- Recent listens: {', '.join(recent_artists)}
New {year} releases to evaluate (use ONLY these):
{json.dumps(items_for_prompt, indent=2)}
For each item worth recommending, output a JSON object with:
index (integer, from above), artist, album, score (0-100, based on fit to taste profile),
subgenre, obscurity (underground/indie/major), similar_to (2-3 artists from taste profile),
why (1 sentence), review_angle (unique angle), tags (string array), url (copy from above)
Skip items without a clear artist + album. Skip classics or non-{year} releases.
Return ONLY a JSON array starting with ["""
    body = {
        "model": "mistral-nemo:latest",
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
        "options": {"temperature": 0.3, "num_predict": 8000},
    }
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            r = await client.post(
                f"{settings.ollama_url}/api/chat",
                json=body,
                timeout=180,
            )
            raw = r.json().get("message", {}).get("content", "") if r.status_code == 200 else ""
            picks = extract_json_array(raw)
            if picks:
                validated = _validate_picks(picks, top)
                log.info(f"Curator returned {len(validated)} validated picks (attempt {attempt+1})")
                if validated:
                    return validated
        except Exception as e:
            log.error(f"Curator attempt {attempt+1} failed: {e}")
        # Pause only when another attempt follows.
        if attempt < max_attempts - 1:
            await asyncio.sleep(2)
    log.warning("Curator failed all attempts")
    return []
def extract_json_array(text: str) -> list:
    """Pull a JSON array out of free-form LLM output.

    Markdown code fences are removed first. The remaining text is parsed as a
    whole: a list is returned as is, and for an object the first value that
    is a list is returned. If that yields nothing, the span from the first
    ``[`` to the last ``]`` is parsed. Returns ``[]`` when no array is found.
    """
    if not text:
        return []

    body = re.sub(r'```(?:json)?\s*', '', text).strip()

    def parse(fragment: str):
        # None stands for "could not be decoded".
        try:
            return json.loads(fragment)
        except Exception:
            return None

    whole = parse(body)
    if isinstance(whole, list):
        return whole
    if isinstance(whole, dict):
        for value in whole.values():
            if isinstance(value, list):
                return value

    opening = body.find('[')
    closing = body.rfind(']')
    if opening != -1 and closing > opening:
        inner = parse(body[opening:closing + 1])
        if isinstance(inner, list):
            return inner
    return []
async def write_picks(new_picks: list[dict], run_stats: dict):
    """Prepend new picks and a run record to metal-picks.json.

    The store keeps the 200 newest picks and the 10 newest run records, and
    ``lastUpdated`` is set to this run's timestamp.

    The file is rewritten atomically: the new content goes to a sibling
    ``.tmp`` file which then replaces the original, so a reader never sees a
    half-written file.

    Args:
        new_picks: Picks to add, newest first.
        run_stats: May carry ``candidates`` and ``filtered`` counts for the
            run record; both default to 0.
    """
    import os  # local import: only needed for the atomic replace below

    def empty_store() -> dict:
        return {"allPicks": [], "runs": [], "lastUpdated": ""}

    try:
        with open(settings.picks_path, "r", encoding="utf-8") as f:
            store = json.load(f)
    except FileNotFoundError:
        store = empty_store()
    except (OSError, ValueError) as e:
        # Unreadable or corrupt file: start over, but leave a trace of why.
        log.warning(f"Could not read {settings.picks_path} ({e}); starting a new store")
        store = empty_store()
    if not isinstance(store, dict):
        log.warning(f"{settings.picks_path} does not hold a JSON object; starting a new store")
        store = empty_store()

    existing_runs = store.get("runs")
    existing_picks = store.get("allPicks")
    if not isinstance(existing_runs, list):
        existing_runs = []
    if not isinstance(existing_picks, list):
        existing_picks = []

    # Add run record
    run = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "source": "fgs-agent",
        "picksAdded": len(new_picks),
        "candidatesSearched": run_stats.get("candidates", 0),
        "candidatesFiltered": run_stats.get("filtered", 0),
    }
    store["runs"] = ([run] + existing_runs)[:10]
    store["allPicks"] = (new_picks + existing_picks)[:200]
    store["lastUpdated"] = run["timestamp"]

    tmp_path = f"{settings.picks_path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(store, f, indent=2)
    os.replace(tmp_path, settings.picks_path)
    log.info(f"Wrote {len(new_picks)} new picks to {settings.picks_path}")
# ── RUN STATE ────────────────────────────────────────────────────────────────
# Process-wide record of the most recent pipeline run; this same dict is
# returned by the status endpoint and updated in place by pipeline_run.
run_state: dict[str, Any] = {
    "status": "idle",  # "idle" until the first run, then "running", "done" or "error"
    "run_id": None,  # short id of the latest run
    "started": None,  # UTC ISO-8601 timestamp set when a run begins
    "finished": None,  # UTC ISO-8601 timestamp set when a run ends
    "picks_added": 0,  # number of picks produced by the latest run
    "candidates_found": 0,  # number of unique search candidates of the latest run
    "error": None,  # str(exception) when the latest run failed
    "log": [],  # {"ts", "msg"} entries appended by each pipeline step
}
# Held by pipeline_run for the whole run, so runs never overlap.
_run_lock = asyncio.Lock()
async def pipeline_run(run_id: str):
    """Execute one complete discovery run, recording progress in ``run_state``.

    Stages, in order: Last.fm fetch, taste profile, SearXNG search, dedup
    filter, embedding scoring, LLM prefilter, LLM curation, then (when picks
    exist) marking the prefiltered candidates as seen and writing the picks
    file. The run ends early with status ``done`` when a search or dedup
    stage leaves nothing to process. Any exception is logged and stored in
    ``run_state`` with status ``error`` instead of being raised.

    Runs are serialised by ``_run_lock``.
    """

    def now_iso() -> str:
        return datetime.now(timezone.utc).isoformat()

    def log_step(msg: str):
        log.info(msg)
        run_state["log"].append({"ts": now_iso(), "msg": msg})

    def end_early(reason: str):
        # Nothing left to process: close the run as "done" without picks.
        log_step(reason)
        run_state["status"] = "done"
        run_state["finished"] = now_iso()

    async with _run_lock:
        run_state.update({
            "status": "running",
            "run_id": run_id,
            "started": now_iso(),
            "finished": None,
            "picks_added": 0,
            "candidates_found": 0,
            "error": None,
            "log": [],
        })
        try:
            async with httpx.AsyncClient() as client:
                log_step("Fetching Last.fm data")
                taste = build_taste_profile(await fetch_lastfm(client))
                log_step(f"Taste profile: {len(taste['topArtists'])} top artists, {len(taste['recentArtists'])} recent")

                log_step("Searching candidates")
                candidates = await search_candidates(client)
                run_state["candidates_found"] = len(candidates)
                if not candidates:
                    end_early("No candidates found — ending run")
                    return

                log_step("Filtering seen albums")
                fresh = await filter_unseen(candidates)
                log_step(f"{len(fresh)} fresh candidates (of {len(candidates)} total)")
                if not fresh:
                    end_early("All candidates already seen — ending run")
                    return

                log_step("Scoring by embedding similarity")
                scored = await score_candidates(client, fresh, taste)

                log_step("Prefiltering with Mistral")
                prefiltered = await prefilter_candidates(client, scored)
                log_step(f"{len(prefiltered)} candidates passed prefilter")

                log_step("Curating picks with Mistral")
                picks = await curate_picks(client, prefiltered, taste)
                log_step(f"{len(picks)} picks scored ≥60")

                if picks:
                    # Every prefiltered candidate is recorded, not only the
                    # picks, so rejected ones are not processed again.
                    await mark_seen(prefiltered)
                    stats = {
                        "candidates": len(candidates),
                        "filtered": len(prefiltered),
                    }
                    await write_picks(picks, stats)

                run_state.update({
                    "status": "done",
                    "finished": now_iso(),
                    "picks_added": len(picks),
                })
                log_step(f"Run complete — {len(picks)} new picks added")
        except Exception as e:
            log.exception(f"Pipeline run failed: {e}")
            run_state.update({
                "status": "error",
                "finished": now_iso(),
                "error": str(e),
            })
# ── API ───────────────────────────────────────────────────────────────────────
# The ASGI application; the script entry point serves it as "agent:app".
app = FastAPI(title="fgs-agent", description="First Gain Stage metal discovery")


# NOTE(review): recent FastAPI releases document lifespan handlers as the
# replacement for on_event — confirm whether this should migrate.
@app.on_event("startup")
async def startup():
    """Create the dedup table if needed and log the configured port."""
    await init_dedup_db()
    log.info(f"fgs-agent started on port {settings.port}")
@app.post("/api/fgs/run")
async def trigger_run(background_tasks: BackgroundTasks):
    """Schedule a discovery run as a background task.

    Returns ``{"status": "started", "run_id": ...}`` with a fresh 8-character
    id, or a 409 response naming the current run when ``run_state`` already
    reports ``running``.

    NOTE(review): the status only becomes ``running`` once the background
    task starts, so two requests arriving close together can both be
    accepted; the second run then waits on the run lock.
    """
    busy = run_state["status"] == "running"
    if busy:
        conflict = {"status": "already_running", "run_id": run_state["run_id"]}
        return JSONResponse(conflict, status_code=409)

    new_id = uuid.uuid4().hex[:8]
    background_tasks.add_task(pipeline_run, new_id)
    return {"status": "started", "run_id": new_id}
@app.get("/api/fgs/status")
async def get_status():
    """Return the shared ``run_state`` dict describing the latest run."""
    return run_state
@app.get("/api/fgs/picks")
async def get_picks():
    """Return the parsed contents of the picks file.

    Any failure to open or decode the file yields a 500 response whose body
    is ``{"error": <message>}``.
    """
    try:
        with open(settings.picks_path, "r") as picks_file:
            payload = json.load(picks_file)
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
    return payload
@app.get("/api/fgs/health")
async def health():
    """Liveness check: always returns a fixed OK payload, touching no other state."""
    return {"status": "ok", "agent": "fgs-agent"}
# ── Entry point ───────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Serve the app on all interfaces at the configured port, without auto-reload.
    # The "agent:app" import string requires this module to be importable as "agent".
    uvicorn.run("agent:app", host="0.0.0.0", port=settings.port, reload=False)

@ -0,0 +1,15 @@
[Unit]
Description=FGS Metal Discovery Agent
After=network.target
[Service]
Type=simple
User=nick2day
WorkingDirectory=/home/nick2day/fgs-agent
EnvironmentFile=/home/nick2day/fgs-agent/.env
ExecStart=/home/nick2day/fgs-agent/venv/bin/python agent.py
Restart=on-failure
RestartSec=10
[Install]
WantedBy=multi-user.target

@ -0,0 +1,7 @@
fastapi==0.115.6
uvicorn[standard]==0.32.1
httpx==0.28.1
pydantic-settings==2.7.0
python-dotenv==1.0.1
aiosqlite==0.20.0
numpy==1.26.4
Loading…
Cancel
Save