Fix pick quality: Bandcamp title parsing, sanitise pass, dashboard esc

Parsing:
- Handle "Album | Artist — Label - Bandcamp" title format (common Bandcamp search result pattern) — stops the artist group at the em-dash so the label name doesn't bleed into the artist field
- clean_name() strips label suffixes from parsed tokens
- artist_from_url() now title-cases the Bandcamp slug
- looks_like_bad_pick() checks album for pipes, broader regex for 'records'/'bandcamp' without word-boundary requirement

Sanitise pass (post-curator):
- Normalise obscurity to high/medium/low (dashboard badge values)
- Drop picks where artist field contains 'bandcamp'/'records'/pipe
- Detect when a review blog domain name was extracted as the artist; attempt recovery from the original search result or drop the pick
- Review domain blocklist: metalinjection, cvltnation, angrymetalguy, nocleansinging, meatmeadmetal, decibelmag, and others

Dashboard fix:
- esc() now escapes single quotes (') to prevent broken onclick attributes when album/why fields contain apostrophes

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
3 months ago · f6b84ee22f
parent 39d6051a1f
commit f6b84ee22f
1 changed file with 112 additions and 7 deletions
--- a/agent.py
+++ b/agent.py
@@ -197,27 +197,82 @@ def is_noise(title: str, url: str) -> bool:
    return any(p.search(text) for p in NOISE_PATTERNS)


+LABEL_SUFFIXES = re.compile(
+    r'\s*[-–|]\s*(bandcamp|records|recordings|productions|music|metal|label|distro|'
+    r'sentient ruin|dark descent|20 buck spin|relapse|prosthetic|nuclear blast|'
+    r'season of mist|century media|profound lore|iron bonehead|hells headbangers|'
+    r'blood harvest|invictus productions|metal blade|redefining darkness)\b.*$',
+    re.I
+)
+
+
+def clean_name(s: str) -> str:
+    """Strip label suffixes and trim whitespace."""
+    return LABEL_SUFFIXES.sub('', s).strip().strip('|').strip()
+
+
 def parse_title(title: str) -> tuple[str, str]:
    """Extract (artist, album) from common title formats."""
-    # "Album Title, by Artist Name" — Bandcamp format
+    # "Album Title, by Artist Name" — Bandcamp standard format
    m = re.match(r'^(.+?),\s+by\s+(.+)$', title, re.I)
    if m:
-        return m.group(2).strip(), m.group(1).strip()
+        return clean_name(m.group(2)), clean_name(m.group(1))
+
+    # Bandcamp search result: "Album Title | Artist Name — Label - Bandcamp"
+    # Stop group 2 at em-dash so label suffix doesn't bleed in
+    pipe_m = re.match(r'^(.+?)\s*\|\s*([^—–]+?)(?:\s*[—–].+)?$', title)
+    if pipe_m:
+        left = clean_name(pipe_m.group(1))
+        right = clean_name(pipe_m.group(2))
+        # Bandcamp puts album first, artist second
+        if right and len(right) < 60:
+            return right, left
+
    # "Artist - Album" or "Artist – Album"
-    m = re.match(r'^([^-–]+?)\s*[-–]\s*(.+)$', title)
+    m = re.match(r'^([^-–|]+?)\s*[-–]\s*(.+)$', title)
    if m and len(m.group(1)) < 60:
-        return m.group(1).strip(), m.group(2).strip()
-    return '', title.strip()
+        return clean_name(m.group(1)), clean_name(m.group(2))
+
+    return '', clean_name(title)


 def artist_from_url(url: str) -> str:
    """Extract artist slug from Bandcamp URL."""
    m = re.match(r'https?://([^.]+)\.bandcamp\.com', url, re.I)
    if m:
-        return m.group(1).replace('-', ' ')
+        return m.group(1).replace('-', ' ').title()
    return ''


+_BAD_FIELD = re.compile(
+    r'bandcamp|recordings?|productions?|distro|\|', re.I
+)
+
+# Known review/blog domains — artist should never be the site name
+_REVIEW_DOMAINS = re.compile(
+    r'metalinjection\.net|cvltnation\.com|decibelmag\.com|invisibleoranges\.com|'
+    r'brooklynvegan\.com|heavyblogisheavy\.com|angrymetalguy\.com|sputnikmusic\.com|'
+    r'nocleansinging\.com|meatmeadmetal\.com|terrorizer\.com|kerrang\.com|'
+    r'loudwire\.com|nocturnalcult\.com|themetalcrypt\.com|rateyourmusic\.com',
+    re.I
+)
+
+
+def looks_like_bad_pick(p: dict) -> bool:
+    """True if artist/album fields are clearly garbage."""
+    artist = p.get("artist", "")
+    album = p.get("album", "")
+    if not artist and not album:
+        return True
+    if _BAD_FIELD.search(artist):
+        return True
+    if "|" in album:
+        return True
+    if len(artist) > 70 or len(album) > 120:
+        return True
+    return False
+
+
 # ── DEDUP ────────────────────────────────────────────────────────────────────

 async def init_dedup_db():
@@ -598,6 +653,55 @@ Return ONLY a JSON array starting with ["""
    return []


+def sanitise_picks(picks: list[dict], source_candidates: list[dict]) -> list[dict]:
+    """Final cleanup pass: fix obscurity values, drop garbage artist labels."""
+    # Build URL → candidate map for fallback artist extraction
+    url_map = {c.get("url", ""): c for c in source_candidates if c.get("url")}
+
+    clean = []
+    for p in picks:
+        # Normalise obscurity to values the dashboard understands
+        obs = (p.get("obscurity") or "").lower()
+        if "underground" in obs or obs in ("high", "demo", "diy"):
+            p["obscurity"] = "high"
+        elif "indie" in obs or "cult" in obs or obs == "medium":
+            p["obscurity"] = "medium"
+        else:
+            p["obscurity"] = "low"
+
+        # If URL is a review article, artist shouldn't be the site/blog name
+        url = p.get("url", "")
+        if _REVIEW_DOMAINS.search(url):
+            domain_slug = re.sub(r'https?://(www\.)?', '', url).split('/')[0].split('.')[0]
+            if domain_slug.lower() in (p.get("artist") or "").lower().replace(' ', ''):
+                # artist IS the domain — try to recover from prefilter data
+                orig = url_map.get(url)
+                if orig and orig.get("artist"):
+                    p["artist"] = orig["artist"]
+                    p["album"] = orig.get("album") or p.get("album", "")
+                else:
+                    # Can't recover — drop this pick
+                    continue
+
+        # If artist looks bad, try to recover from search result
+        if _BAD_FIELD.search(p.get("artist", "")):
+            orig = url_map.get(p.get("url", ""))
+            if orig:
+                artist_fallback, album_fallback = parse_title(orig.get("title", ""))
+                if not artist_fallback:
+                    artist_fallback = artist_from_url(orig.get("url", ""))
+                p["artist"] = artist_fallback or orig.get("artist", p["artist"])
+                if not p.get("album"):
+                    p["album"] = album_fallback
+
+        if looks_like_bad_pick(p):
+            log.debug(f"Dropping bad pick after cleanup: {p.get('artist')} / {p.get('album')}")
+            continue
+        clean.append(p)
+
+    return clean
+
+
 def extract_json_array(text: str) -> list:
    """Robustly extract a JSON array from LLM output."""
    if not text:
@@ -724,7 +828,8 @@ async def pipeline_run(run_id: str):

            log_step("Curating picks with Mistral")
            picks = await curate_picks(client, prefiltered, taste)
-            log_step(f"{len(picks)} picks scored ≥50")
+            picks = sanitise_picks(picks, scored)
+            log_step(f"{len(picks)} picks after cleanup")

            if picks:
                # Mark only actual picks as seen — unselected candidates stay eligible for re-eval