Fix pick quality: Bandcamp title parsing, sanitise pass, dashboard esc

Parsing:
- Handle "Album | Artist — Label - Bandcamp" title format (common
  Bandcamp search result pattern) — stops group at em-dash so label
  name doesn't bleed into artist field
- clean_name() strips label suffixes from parsed tokens
- artist_from_url() now title-cases Bandcamp slug
- looks_like_bad_pick() checks album for pipes, broader regex for
  'records'/'bandcamp' without word-boundary requirement

Sanitise pass (post-curator):
- Normalise obscurity to high/medium/low (dashboard badge values)
- Drop picks where artist field contains 'bandcamp'/'records'/pipe
- Detect when a review blog domain name was extracted as the artist;
  attempt recovery from original search result or drop the pick
- Review domain blocklist: metalinjection, cvltnation, angrymetalguy,
  nocleansinging, meatmeadmetal, decibelmag, and others

Dashboard fix:
- esc() now escapes single quotes (') to prevent broken onclick
  attributes when album/why fields contain apostrophes

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
main
nick2day 3 months ago
parent 39d6051a1f
commit f6b84ee22f

@ -197,27 +197,82 @@ def is_noise(title: str, url: str) -> bool:
return any(p.search(text) for p in NOISE_PATTERNS) return any(p.search(text) for p in NOISE_PATTERNS)
# Matches a trailing "<hyphen or pipe> <keyword> ..." tail and everything after
# it up to the end of the string. Keywords are generic label words plus a list
# of specific label names; the match is case-insensitive and the keyword must
# end on a word boundary.
LABEL_SUFFIXES = re.compile(
    r'\s*[-|]\s*(bandcamp|records|recordings|productions|music|metal|label|distro|'
    r'sentient ruin|dark descent|20 buck spin|relapse|prosthetic|nuclear blast|'
    r'season of mist|century media|profound lore|iron bonehead|hells headbangers|'
    r'blood harvest|invictus productions|metal blade|redefining darkness)\b.*$',
    flags=re.IGNORECASE,
)
def clean_name(s: str) -> str:
    """Strip label suffixes and trim whitespace.

    Removes whatever ``LABEL_SUFFIXES`` matches, then trims surrounding
    whitespace, then any leading/trailing pipe characters, then whitespace
    again (a pipe may have been shielding more spaces).

    Returns an empty string for ``None`` or empty input.
    """
    if not s:
        return ''
    return LABEL_SUFFIXES.sub('', s).strip().strip('|').strip()
def parse_title(title: str) -> tuple[str, str]:
    """Extract (artist, album) from common title formats.

    Formats are tried in this order; the first one that matches wins:

    1. ``"Album Title, by Artist Name"`` (Bandcamp standard format)
    2. ``"Album Title | Artist Name — Label - Bandcamp"`` (pipe-separated)
    3. ``"Artist - Album"`` (split at the first ASCII hyphen)

    Every returned token is passed through ``clean_name``.

    Returns:
        ``(artist, album)``. When no format matches, the artist is ``''`` and
        the album is the cleaned title. ``None`` or empty input gives
        ``('', '')``.
    """
    if not title:
        return '', ''

    # "Album Title, by Artist Name" — Bandcamp standard format
    m = re.match(r'^(.+?),\s+by\s+(.+)$', title, re.I)
    if m:
        return clean_name(m.group(2)), clean_name(m.group(1))

    # Bandcamp search result: "Album Title | Artist Name — Label - Bandcamp"
    # Stop group 2 at em-dash/en-dash so the label suffix doesn't bleed in
    pipe_m = re.match(r'^(.+?)\s*\|\s*([^—–]+?)(?:\s*[—–].+)?$', title)
    if pipe_m:
        left = clean_name(pipe_m.group(1))
        right = clean_name(pipe_m.group(2))
        # Bandcamp puts album first, artist second
        if right and len(right) < 60:
            return right, left

    # "Artist - Album": group 1 may not contain a hyphen or a pipe, so a
    # pipe-style title that fell through above is not split here
    m = re.match(r'^([^-|]+?)\s*[-]\s*(.+)$', title)
    if m and len(m.group(1)) < 60:
        return clean_name(m.group(1)), clean_name(m.group(2))

    return '', clean_name(title)
def artist_from_url(url: str) -> str:
    """Extract a display-cased artist name from a Bandcamp subdomain URL.

    ``https://blut-aus-nord.bandcamp.com/album/x`` gives ``"Blut Aus Nord"``.

    Returns ``''`` when ``url`` is empty/``None``, is not a
    ``<slug>.bandcamp.com`` URL, or when the subdomain is one of Bandcamp's
    own hosts (``www``, ``daily``) rather than an artist page.
    """
    if not url:
        return ''
    # The host label must not contain '/' so a bandcamp.com string later in
    # the path or query cannot be mistaken for the subdomain.
    m = re.match(r'https?://([^./]+)\.bandcamp\.com', url, re.I)
    if not m:
        return ''
    slug = m.group(1)
    if slug.lower() in ('www', 'daily'):
        return ''
    # capitalize() per word rather than str.title(): title() upper-cases any
    # letter that follows a digit, turning "3rd" into "3Rd".
    return ' '.join(word.capitalize() for word in slug.split('-') if word)
# Substrings that mark an artist field as scraped junk rather than a real
# name: site/label words or a leftover pipe separator. Case-insensitive and
# deliberately without word boundaries. 'records' is listed explicitly because
# 'recordings?' does not match it ("Relapse Records" must be caught).
_BAD_FIELD = re.compile(
    r'bandcamp|records|recordings?|productions?|distro|\|', re.I
)
# Known review/blog domains — artist should never be the site name.
# Case-insensitive alternation of literal host names; used with .search(), so
# it matches wherever one of these hosts appears in the URL string.
_REVIEW_DOMAINS = re.compile(
    r'metalinjection\.net|cvltnation\.com|decibelmag\.com|invisibleoranges\.com|'
    r'brooklynvegan\.com|heavyblogisheavy\.com|angrymetalguy\.com|sputnikmusic\.com|'
    r'nocleansinging\.com|meatmeadmetal\.com|terrorizer\.com|kerrang\.com|'
    r'loudwire\.com|nocturnalcult\.com|themetalcrypt\.com|rateyourmusic\.com',
    flags=re.IGNORECASE,
)
def looks_like_bad_pick(p: dict) -> bool:
    """True if artist/album fields are clearly garbage.

    A pick is rejected when any of the following holds:

    - both ``artist`` and ``album`` are missing or empty
    - ``artist`` matches ``_BAD_FIELD``
    - ``album`` contains a pipe character
    - ``artist`` is longer than 70 characters or ``album`` longer than 120

    Missing keys and ``None`` values are treated as empty strings, and other
    non-string values are stringified, so a malformed pick is classified
    instead of raising.
    """
    # `or ""` (not a .get default) so a key that is present but null is
    # handled too.
    artist = str(p.get("artist") or "")
    album = str(p.get("album") or "")
    if not artist and not album:
        return True
    if _BAD_FIELD.search(artist):
        return True
    if "|" in album:
        return True
    return len(artist) > 70 or len(album) > 120
# ── DEDUP ──────────────────────────────────────────────────────────────────── # ── DEDUP ────────────────────────────────────────────────────────────────────
async def init_dedup_db(): async def init_dedup_db():
@ -598,6 +653,55 @@ Return ONLY a JSON array starting with ["""
return [] return []
def sanitise_picks(picks: list[dict], source_candidates: list[dict]) -> list[dict]:
    """Final cleanup pass: fix obscurity values, drop garbage artist labels.

    For every pick, in order:

    1. ``obscurity`` is normalised to ``"high"``, ``"medium"`` or ``"low"``.
    2. If the URL is on a known review domain and the artist contains the
       site's own name, the artist/album are restored from the matching
       source candidate; the pick is dropped if that is not possible.
    3. If the artist matches ``_BAD_FIELD``, a replacement is derived from
       the candidate's title, then its Bandcamp URL, then its ``artist``.
    4. Picks that ``looks_like_bad_pick`` still rejects are dropped.

    Args:
        picks: Curated picks. The dicts are modified in place.
        source_candidates: Original search candidates, matched to picks by
            their ``url`` value.

    Returns:
        A new list holding the surviving pick dicts, in input order.
    """
    # Build URL → candidate map for fallback artist extraction
    url_map = {c["url"]: c for c in source_candidates if c.get("url")}

    clean = []
    for p in picks:
        # Null text fields would break the regex and length checks below
        # (and in looks_like_bad_pick), so turn them into empty strings.
        for key in ("artist", "album", "url"):
            if key in p and p[key] is None:
                p[key] = ""

        # Normalise obscurity to values the dashboard understands
        obs = str(p.get("obscurity") or "").lower()
        if "underground" in obs or obs in ("high", "demo", "diy"):
            p["obscurity"] = "high"
        elif "indie" in obs or "cult" in obs or obs == "medium":
            p["obscurity"] = "medium"
        else:
            p["obscurity"] = "low"

        url = p.get("url") or ""
        orig = url_map.get(url)

        # If URL is a review article, artist shouldn't be the site/blog name
        if _REVIEW_DOMAINS.search(url):
            domain_slug = re.sub(r'https?://(www\.)?', '', url).split('/')[0].split('.')[0]
            squashed_artist = (p.get("artist") or "").lower().replace(' ', '')
            # An empty slug is a substring of everything, so require a real one.
            if domain_slug and domain_slug.lower() in squashed_artist:
                # artist IS the domain — try to recover from prefilter data
                if orig and orig.get("artist"):
                    p["artist"] = orig["artist"]
                    p["album"] = orig.get("album") or p.get("album", "")
                else:
                    # Can't recover — drop this pick
                    continue

        # If artist looks bad, try to recover from search result
        artist = p.get("artist") or ""
        if _BAD_FIELD.search(artist) and orig:
            artist_fallback, album_fallback = parse_title(orig.get("title") or "")
            if not artist_fallback:
                artist_fallback = artist_from_url(orig.get("url") or "")
            # Keep the known-bad artist when nothing better exists so the
            # final check drops the pick rather than emitting an empty artist.
            p["artist"] = artist_fallback or orig.get("artist") or artist
            if not p.get("album"):
                p["album"] = album_fallback

        if looks_like_bad_pick(p):
            log.debug(
                "Dropping bad pick after cleanup: %s / %s",
                p.get('artist'), p.get('album'),
            )
            continue

        clean.append(p)

    return clean
def extract_json_array(text: str) -> list: def extract_json_array(text: str) -> list:
"""Robustly extract a JSON array from LLM output.""" """Robustly extract a JSON array from LLM output."""
if not text: if not text:
@ -724,7 +828,8 @@ async def pipeline_run(run_id: str):
log_step("Curating picks with Mistral") log_step("Curating picks with Mistral")
picks = await curate_picks(client, prefiltered, taste) picks = await curate_picks(client, prefiltered, taste)
log_step(f"{len(picks)} picks scored ≥50") picks = sanitise_picks(picks, scored)
log_step(f"{len(picks)} picks after cleanup")
if picks: if picks:
# Mark only actual picks as seen — unselected candidates stay eligible for re-eval # Mark only actual picks as seen — unselected candidates stay eligible for re-eval

Loading…
Cancel
Save