# Reconstructed from the post-change side of the agent.py patch (hunks at
# old lines 197, 598 and 724), which arrived with its newlines collapsed.
# The patch context also shows pieces of is_noise, init_dedup_db,
# extract_json_array and pipeline_run; those definitions are cut off in this
# view and are not reproduced here.  The only visible wiring is that
# pipeline_run calls ``picks = sanitise_picks(picks, scored)`` right after
# ``curate_picks(client, prefiltered, taste)`` and then logs
# ``f"{len(picks)} picks after cleanup"``.

# Trailing "<separator> <label or platform word> ..." decoration on a title.
# The separator class includes the em-dash as well as hyphen, en-dash and
# pipe, because parse_title treats the em-dash as the label separator too.
LABEL_SUFFIXES = re.compile(
    r'\s*[-–—|]\s*(bandcamp|records|recordings|productions|music|metal|label|distro|'
    r'sentient ruin|dark descent|20 buck spin|relapse|prosthetic|nuclear blast|'
    r'season of mist|century media|profound lore|iron bonehead|hells headbangers|'
    r'blood harvest|invictus productions|metal blade|redefining darkness)\b.*$',
    re.I
)

# "Artist - Album" with whitespace on both sides of the dash.  Tried first so
# that hyphens inside a name ("Blut-Aus-Nord - 777") are not used as the split.
_SPACED_DASH_TITLE = re.compile(r'^([^|]+?)\s+[-–]\s+(.+)$')
# Same format with optional whitespace; this is the pattern the patch used.
_ANY_DASH_TITLE = re.compile(r'^([^-–|]+?)\s*[-–]\s*(.+)$')

# "<slug>.bandcamp.com"; the slug may not contain "." or "/".
_BANDCAMP_HOST = re.compile(r'https?://([^./]+)\.bandcamp\.com', re.I)
# Bandcamp-owned subdomains that are not artist pages.
_NON_ARTIST_SUBDOMAINS = frozenset({'www', 'daily'})


def _text(value) -> str:
    """Return *value* as a string, mapping None to ''."""
    if value is None:
        return ''
    return value if isinstance(value, str) else str(value)


def clean_name(s: str) -> str:
    """Strip label suffixes, stray pipes and surrounding whitespace.

    A falsy input (None or '') yields ''.
    """
    if not s:
        return ''
    return LABEL_SUFFIXES.sub('', s).strip().strip('|').strip()


def parse_title(title: str) -> tuple[str, str]:
    """Extract (artist, album) from common title formats.

    Formats are tried in this order:
      1. "Album Title, by Artist Name"
      2. "Album Title | Artist Name — Label - Bandcamp"
      3. "Artist - Album" / "Artist – Album"

    Both parts are passed through clean_name.  When nothing matches the
    artist is '' and the whole cleaned title is returned as the album.
    """
    title = title or ''

    # "Album Title, by Artist Name" — Bandcamp standard format
    m = re.match(r'^(.+?),\s+by\s+(.+)$', title, re.I)
    if m:
        return clean_name(m.group(2)), clean_name(m.group(1))

    # Bandcamp search result: "Album Title | Artist Name — Label - Bandcamp"
    # Group 2 stops at an em/en dash so the label suffix doesn't bleed in.
    pipe_m = re.match(r'^(.+?)\s*\|\s*([^—–]+?)(?:\s*[—–].+)?$', title)
    if pipe_m:
        left = clean_name(pipe_m.group(1))
        right = clean_name(pipe_m.group(2))
        # Bandcamp puts album first, artist second
        if right and len(right) < 60:
            return right, left

    # "Artist - Album" or "Artist – Album": whitespace-delimited dash first,
    # then any dash.  An over-long left part is not accepted as an artist.
    for pattern in (_SPACED_DASH_TITLE, _ANY_DASH_TITLE):
        m = pattern.match(title)
        if m and len(m.group(1)) < 60:
            return clean_name(m.group(1)), clean_name(m.group(2))

    return '', clean_name(title)


def artist_from_url(url: str) -> str:
    """Extract a display name from the subdomain of a Bandcamp URL.

    "https://grave-hex.bandcamp.com/..." gives "Grave Hex".  Returns '' for
    non-Bandcamp URLs, for the www/daily subdomains and for falsy input.
    """
    m = _BANDCAMP_HOST.match(url or '')
    if not m:
        return ''
    slug = m.group(1)
    if slug.lower() in _NON_ARTIST_SUBDOMAINS:
        return ''
    # str.capitalize per word instead of str.title: title() also upper-cases
    # a letter that follows a digit ("20buckspin" -> "20Buckspin").
    return ' '.join(word.capitalize() for word in slug.split('-') if word)


_BAD_FIELD = re.compile(
    r'bandcamp|recordings?|productions?|distro|\|', re.I
)

# Known review/blog domains — artist should never be the site name
_REVIEW_DOMAINS = re.compile(
    r'metalinjection\.net|cvltnation\.com|decibelmag\.com|invisibleoranges\.com|'
    r'brooklynvegan\.com|heavyblogisheavy\.com|angrymetalguy\.com|sputnikmusic\.com|'
    r'nocleansinging\.com|meatmeadmetal\.com|terrorizer\.com|kerrang\.com|'
    r'loudwire\.com|nocturnalcult\.com|themetalcrypt\.com|rateyourmusic\.com',
    re.I
)


def looks_like_bad_pick(p: dict) -> bool:
    """Return True if the artist/album fields of *p* are clearly garbage.

    A pick is bad when both fields are empty, when the artist matches
    _BAD_FIELD, when the album contains a pipe, or when either field is
    implausibly long.  Missing, None and non-string values count as ''.
    """
    artist = _text(p.get("artist"))
    album = _text(p.get("album"))
    if not artist.strip() and not album.strip():
        return True
    if _BAD_FIELD.search(artist):
        return True
    if "|" in album:
        return True
    return len(artist) > 70 or len(album) > 120


def _normalise_obscurity(value) -> str:
    """Map a free-form obscurity label onto "high", "medium" or "low"."""
    obs = _text(value).strip().lower()
    if "underground" in obs or obs in ("high", "demo", "diy"):
        return "high"
    if "indie" in obs or "cult" in obs or obs == "medium":
        return "medium"
    return "low"


def sanitise_picks(picks: list[dict], source_candidates: list[dict]) -> list[dict]:
    """Final cleanup pass: fix obscurity values, drop garbage artist labels.

    Args:
        picks: pick dicts to clean.  Kept picks are modified in place
            ("obscurity", and possibly "artist"/"album").  Entries that are
            not dicts are skipped.
        source_candidates: candidate dicts; those with a truthy "url" are
            used to recover artist/album for the pick with the same URL.

    Returns:
        The surviving picks, in their original order.
    """
    # Build URL → candidate map for fallback artist extraction
    url_map = {
        c["url"]: c
        for c in source_candidates
        if isinstance(c, dict) and c.get("url")
    }

    clean = []
    for p in picks:
        if not isinstance(p, dict):
            continue

        # Normalise obscurity to values the dashboard understands
        p["obscurity"] = _normalise_obscurity(p.get("obscurity"))

        url = _text(p.get("url"))
        orig = url_map.get(url)

        # If URL is a review article, artist shouldn't be the site/blog name
        site = _REVIEW_DOMAINS.search(url)
        if site:
            # Use the matched domain, not the first host label, so that
            # subdomains and query strings cannot change which name is tested.
            site_slug = site.group(0).split('.')[0].lower()
            if site_slug in _text(p.get("artist")).lower().replace(' ', ''):
                # artist IS the domain — try to recover from candidate data
                recovered = _text(orig.get("artist")) if orig else ''
                if not recovered or site_slug in recovered.lower().replace(' ', ''):
                    # Nothing usable to recover from — drop this pick
                    continue
                p["artist"] = recovered
                p["album"] = orig.get("album") or p.get("album", "")

        # If artist looks bad, try to recover from the candidate's title/URL
        if orig and _BAD_FIELD.search(_text(p.get("artist"))):
            artist_fallback, album_fallback = parse_title(_text(orig.get("title")))
            if not artist_fallback:
                artist_fallback = artist_from_url(url)
            p["artist"] = artist_fallback or orig.get("artist") or p["artist"]
            if not p.get("album"):
                p["album"] = album_fallback

        if looks_like_bad_pick(p):
            log.debug(
                "Dropping bad pick after cleanup: %s / %s",
                p.get('artist'), p.get('album'),
            )
            continue
        clean.append(p)

    return clean