Fix pick quality: Bandcamp title parsing, sanitise pass, dashboard esc

Parsing:
- Handle "Album | Artist — Label - Bandcamp" title format (common Bandcamp search result pattern) — stops group at em-dash so label name doesn't bleed into artist field
- clean_name() strips label suffixes from parsed tokens
- artist_from_url() now title-cases Bandcamp slug
- looks_like_bad_pick() checks album for pipes, broader regex for 'records'/'bandcamp' without word-boundary requirement

Sanitise pass (post-curator):
- Normalise obscurity to high/medium/low (dashboard badge values)
- Drop picks where artist field contains 'bandcamp'/'records'/pipe
- Detect when a review blog domain name was extracted as the artist; attempt recovery from original search result or drop the pick
- Review domain blocklist: metalinjection, cvltnation, angrymetalguy, nocleansinging, meatmeadmetal, decibelmag, and others

Dashboard fix:
- esc() now escapes single quotes (') to prevent broken onclick attributes when album/why fields contain apostrophes

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
3 months ago · f6b84ee22f
parent 39d6051a1f
commit f6b84ee22f
1 changed files with 112 additions and 7 deletions
--- a/agent.py
+++ b/agent.py
@ -197,27 +197,82 @@ def is_noise(title: str, url: str) -> bool:
    return any(p.search(text) for p in NOISE_PATTERNS)
 # Matches a trailing "<separator> <label keyword> ..." suffix, case-insensitively.
 #   - separator: hyphen, en dash or pipe (an em dash is NOT in the class)
 #   - keyword:   a generic word (bandcamp, records, music, ...) or one of the
 #                label names listed below; it must start right after the
 #                separator and end on a word boundary
 #   - `.*$`:     everything after the keyword up to the end of the string is
 #                part of the match, so substituting '' removes the whole tail
 LABEL_SUFFIXES = re.compile(
    r'\s*[-–|]\s*(bandcamp|records|recordings|productions|music|metal|label|distro|'
    r'sentient ruin|dark descent|20 buck spin|relapse|prosthetic|nuclear blast|'
    r'season of mist|century media|profound lore|iron bonehead|hells headbangers|'
    r'blood harvest|invictus productions|metal blade|redefining darkness)\b.*$',
    re.I
 )
 def clean_name(s: str) -> str:
    """Return *s* without its label suffix and with the edges tidied up."""
    without_label = LABEL_SUFFIXES.sub('', s)
    # Three separate passes: outer whitespace, then stray pipes, then any
    # whitespace that the pipe removal exposed.
    trimmed = without_label.strip()
    trimmed = trimmed.strip('|')
    return trimmed.strip()
 def parse_title(title: str) -> tuple[str, str]:
    """Extract (artist, album) from common title formats."""
-    # "Album Title, by Artist Name" — Bandcamp format
+    # "Album Title, by Artist Name" — Bandcamp standard format
    m = re.match(r'^(.+?),\s+by\s+(.+)$', title, re.I)
    if m:
-        return m.group(2).strip(), m.group(1).strip()
+        return clean_name(m.group(2)), clean_name(m.group(1))
    # Bandcamp search result: "Album Title | Artist Name — Label - Bandcamp"
    # Stop group 2 at em-dash so label suffix doesn't bleed in
    pipe_m = re.match(r'^(.+?)\s*\|\s*([^—–]+?)(?:\s*[—–].+)?$', title)
    if pipe_m:
        left = clean_name(pipe_m.group(1))
        right = clean_name(pipe_m.group(2))
        # Bandcamp puts album first, artist second
        if right and len(right) < 60:
            return right, left
    # "Artist - Album" or "Artist – Album"
-    m = re.match(r'^([^-–]+?)\s*[-–]\s*(.+)$', title)
+    m = re.match(r'^([^-–|]+?)\s*[-–]\s*(.+)$', title)
    if m and len(m.group(1)) < 60:
-        return m.group(1).strip(), m.group(2).strip()
+        return clean_name(m.group(1)), clean_name(m.group(2))
-    return '', title.strip()
+
    return '', clean_name(title)
 def artist_from_url(url: str) -> str:
    """Extract artist slug from Bandcamp URL."""
    m = re.match(r'https?://([^.]+)\.bandcamp\.com', url, re.I)
    if m:
-        return m.group(1).replace('-', ' ')
+        return m.group(1).replace('-', ' ').title()
    return ''
 # Substrings that mark an artist field as a label/site name rather than a band.
 # No word boundaries are used, so the words also match inside longer tokens.
 # 'records' is listed explicitly: 'recordings?' alone does not match it.
 _BAD_FIELD = re.compile(
    r'bandcamp|records|recordings?|productions?|distro|\|', re.I
 )
 # Known review/blog domains — artist should never be the site name
 _REVIEW_DOMAINS = re.compile(
    r'metalinjection\.net|cvltnation\.com|decibelmag\.com|invisibleoranges\.com|'
    r'brooklynvegan\.com|heavyblogisheavy\.com|angrymetalguy\.com|sputnikmusic\.com|'
    r'nocleansinging\.com|meatmeadmetal\.com|terrorizer\.com|kerrang\.com|'
    r'loudwire\.com|nocturnalcult\.com|themetalcrypt\.com|rateyourmusic\.com',
    re.I
 )
 def looks_like_bad_pick(p: dict) -> bool:
    """Return True if the pick's artist/album fields are clearly garbage.

    A pick is rejected when:

    * both ``artist`` and ``album`` are empty,
    * ``artist`` matches ``_BAD_FIELD``,
    * ``album`` contains a pipe character, or
    * ``artist`` is longer than 70 or ``album`` longer than 120 characters.

    A missing key and an explicit ``None`` value are both treated as an
    empty string, so a null field cannot raise ``TypeError`` here.
    """
    artist = p.get("artist") or ""
    album = p.get("album") or ""
    if not artist and not album:
        return True
    if _BAD_FIELD.search(artist):
        return True
    if "|" in album:
        return True
    return len(artist) > 70 or len(album) > 120
 # ── DEDUP ────────────────────────────────────────────────────────────────────
 async def init_dedup_db():
@ -598,6 +653,55 @@ Return ONLY a JSON array starting with ["""
    return []
 def sanitise_picks(picks: list[dict], source_candidates: list[dict]) -> list[dict]:
    """Final cleanup pass over curated picks.

    For each pick, in order:

    1. Normalise ``obscurity`` to ``"high"``, ``"medium"`` or ``"low"``;
       anything unrecognised becomes ``"low"``.
    2. If the pick's URL matches ``_REVIEW_DOMAINS`` and the artist field
       contains that site's name, restore artist/album from the source
       candidate with the same URL, or drop the pick if that is impossible.
    3. If the artist matches ``_BAD_FIELD``, re-derive it from the source
       candidate's title, then its URL, then its own ``artist`` value.
    4. Drop the pick if ``looks_like_bad_pick`` still rejects it.

    The pick dicts are updated in place.

    Args:
        picks: Pick dicts; reads ``obscurity``, ``url``, ``artist``, ``album``.
        source_candidates: Candidate dicts, looked up by exact ``url``;
            reads ``title``, ``artist`` and ``album``.

    Returns:
        The surviving picks, in their original order.
    """
    # Build URL → candidate map for fallback artist extraction
    url_map = {c["url"]: c for c in source_candidates if c.get("url")}
    clean = []
    for p in picks:
        # An explicit null is treated like an empty string so the regex and
        # length checks below cannot raise TypeError.
        for field in ("artist", "album", "url"):
            if field in p and p[field] is None:
                p[field] = ""

        # Normalise obscurity to values the dashboard understands
        obs = str(p.get("obscurity") or "").strip().lower()
        if "underground" in obs or obs in ("high", "demo", "diy"):
            p["obscurity"] = "high"
        elif "indie" in obs or "cult" in obs or obs == "medium":
            p["obscurity"] = "medium"
        else:
            p["obscurity"] = "low"

        # If URL is a review article, artist shouldn't be the site/blog name
        url = p.get("url", "")
        site = _REVIEW_DOMAINS.search(url)
        if site:
            # Take the slug from the matched review domain, not from the first
            # host label: for "m.sputnikmusic.com" the first label is "m",
            # which is a substring of almost every artist name.
            domain_slug = site.group(0).split('.')[0].lower()
            artist_compact = p.get("artist", "").lower().replace(' ', '')
            if domain_slug in artist_compact:
                # artist IS the domain — try to recover from prefilter data
                orig = url_map.get(url)
                if orig and orig.get("artist"):
                    p["artist"] = orig["artist"]
                    p["album"] = orig.get("album") or p.get("album", "")
                else:
                    # Can't recover — drop this pick
                    continue

        # If artist looks bad, try to recover from search result
        if _BAD_FIELD.search(p.get("artist", "")):
            orig = url_map.get(url)
            if orig:
                artist_fallback, album_fallback = parse_title(orig.get("title") or "")
                if not artist_fallback:
                    artist_fallback = artist_from_url(orig.get("url") or "")
                # Trailing `or ""` keeps a null candidate artist from
                # reintroducing None into the pick.
                p["artist"] = artist_fallback or orig.get("artist", p["artist"]) or ""
                if not p.get("album"):
                    p["album"] = album_fallback

        if looks_like_bad_pick(p):
            log.debug(f"Dropping bad pick after cleanup: {p.get('artist')} / {p.get('album')}")
            continue
        clean.append(p)
    return clean
 def extract_json_array(text: str) -> list:
    """Robustly extract a JSON array from LLM output."""
    if not text:
@ -724,7 +828,8 @@ async def pipeline_run(run_id: str):
            log_step("Curating picks with Mistral")
            picks = await curate_picks(client, prefiltered, taste)
-            log_step(f"{len(picks)} picks scored ≥50")
+            picks = sanitise_picks(picks, scored)
            log_step(f"{len(picks)} picks after cleanup")
            if picks:
                # Mark only actual picks as seen — unselected candidates stay eligible for re-eval