@ -197,27 +197,82 @@ def is_noise(title: str, url: str) -> bool:
return any ( p . search ( text ) for p in NOISE_PATTERNS )
# Matches a trailing "<separator> <label-or-platform word> ..." segment so it
# can be removed from a title or name.
#   * The separator must be a hyphen, en dash, em dash or pipe. Plain
#     whitespace is deliberately NOT a separator, otherwise a name such as
#     "Heavy Metal" would be truncated to "Heavy".
#   * The keyword must end on a word boundary; everything from the separator
#     to the end of the string is consumed.
#   * Matching is case-insensitive.
LABEL_SUFFIXES = re.compile(
    r'\s*[-–—|]\s*(bandcamp|records|recordings|productions|music|metal|label|distro|'
    r'sentient ruin|dark descent|20 buck spin|relapse|prosthetic|nuclear blast|'
    r'season of mist|century media|profound lore|iron bonehead|hells headbangers|'
    r'blood harvest|invictus productions|metal blade|redefining darkness)\b.*$',
    re.I,
)
def clean_name(s: str) -> str:
    """Return *s* with any label suffix, edge pipes and padding removed.

    The text matched by ``LABEL_SUFFIXES`` is deleted first; the remainder is
    trimmed of whitespace, then of pipe characters at either edge, then of
    whitespace once more.
    """
    without_label = LABEL_SUFFIXES.sub('', s)
    trimmed = without_label.strip()
    # Only pipes sitting at the very edges are removed; inner ones are kept.
    unpiped = trimmed.strip('|')
    return unpiped.strip()
def parse_title(title: str) -> tuple[str, str]:
    """Extract (artist, album) from common title formats.

    Formats are tried in this order and the first one that applies wins:

    1. ``"Album Title, by Artist Name"`` -- Bandcamp standard format.
    2. ``"Album Title | Artist Name — Label - Bandcamp"`` -- Bandcamp search
       result; used only when the cleaned right-hand part is non-empty and
       shorter than 60 characters.
    3. ``"Artist - Album"`` with a hyphen, en dash or em dash as separator;
       used only when the artist part is shorter than 60 characters.

    Every returned part is passed through ``clean_name``.

    Args:
        title: Raw page or search-result title.

    Returns:
        ``(artist, album)``. When no format matches, the artist is ``''`` and
        the album is the cleaned title. An empty or ``None`` title yields
        ``('', '')``.
    """
    if not title:
        return '', ''

    # "Album Title, by Artist Name" — Bandcamp standard format
    m = re.match(r'^(.+?),\s+by\s+(.+)$', title, re.I)
    if m:
        return clean_name(m.group(2)), clean_name(m.group(1))

    # Bandcamp search result: "Album Title | Artist Name — Label - Bandcamp"
    # Group 2 stops at the first em/en dash so the label suffix doesn't bleed in.
    pipe_m = re.match(r'^(.+?)\s*\|\s*([^—–]+?)(?:\s*[—–].+)?$', title)
    if pipe_m:
        left = clean_name(pipe_m.group(1))
        right = clean_name(pipe_m.group(2))
        # Bandcamp puts album first, artist second
        if right and len(right) < 60:
            return right, left

    # "Artist - Album", "Artist – Album" or "Artist — Album".
    # The artist part may not contain a dash or a pipe, so the split happens
    # at the first dash in the title.
    m = re.match(r'^([^-–—|]+?)\s*[-–—]\s*(.+)$', title)
    if m and len(m.group(1)) < 60:
        return clean_name(m.group(1)), clean_name(m.group(2))

    return '', clean_name(title)
def artist_from_url(url: str) -> str:
    """Derive a display name from the subdomain of a Bandcamp URL.

    The first host label of ``http(s)://<slug>.bandcamp.com`` is taken,
    hyphens are replaced with spaces and the result is title-cased, e.g.
    ``https://dark-descent.bandcamp.com/...`` -> ``"Dark Descent"``.

    Args:
        url: URL to inspect; matching is case-insensitive and anchored at the
            start of the string.

    Returns:
        The derived name, or ``''`` when *url* is empty/``None`` or is not a
        ``<slug>.bandcamp.com`` address.
    """
    if not url:
        return ''
    m = re.match(r'https?://([^.]+)\.bandcamp\.com', url, re.I)
    if m is None:
        return ''
    return m.group(1).replace('-', ' ').title()
# Case-insensitive marker for field values that carry platform/label wording
# instead of a real name: one of the listed words (optionally plural where
# the pattern allows it) or a literal pipe, anywhere in the string.
_BAD_FIELD = re.compile(
    r'bandcamp|recordings?|productions?|distro|\|',
    flags=re.IGNORECASE,
)
# Known review/blog domains. A pick whose URL points at one of these sites
# must never end up with the site's own name as its artist.
# Case-insensitive; matches when any listed domain occurs in the searched text.
_REVIEW_DOMAINS = re.compile(
    r'metalinjection\.net|cvltnation\.com|'
    r'decibelmag\.com|invisibleoranges\.com|'
    r'brooklynvegan\.com|heavyblogisheavy\.com|'
    r'angrymetalguy\.com|sputnikmusic\.com|'
    r'nocleansinging\.com|meatmeadmetal\.com|'
    r'terrorizer\.com|kerrang\.com|'
    r'loudwire\.com|nocturnalcult\.com|'
    r'themetalcrypt\.com|rateyourmusic\.com',
    flags=re.IGNORECASE,
)
def looks_like_bad_pick(p: dict) -> bool:
    """Return True if the pick's artist/album fields are clearly garbage.

    A pick is rejected when any of the following holds:

    * both artist and album are empty, missing or ``None``;
    * the artist matches ``_BAD_FIELD``;
    * the album contains a pipe character;
    * the artist is longer than 70 characters or the album longer than 120.

    Args:
        p: Pick mapping; only the ``"artist"`` and ``"album"`` keys are read.

    Returns:
        ``True`` when the pick should be discarded, ``False`` otherwise.
    """
    # `or ""` (rather than a .get default) also covers keys that are present
    # but hold None, which would otherwise raise in the checks below.
    artist = p.get("artist") or ""
    album = p.get("album") or ""
    if not (artist or album):
        return True
    return bool(
        _BAD_FIELD.search(artist)
        or "|" in album
        or len(artist) > 70
        or len(album) > 120
    )
# ── DEDUP ────────────────────────────────────────────────────────────────────
async def init_dedup_db ( ) :
@ -598,6 +653,55 @@ Return ONLY a JSON array starting with ["""
return [ ]
def _normalise_obscurity(raw) -> str:
    """Map a free-form obscurity value onto "high", "medium" or "low"."""
    obs = str(raw or "").lower()
    if "underground" in obs or obs in ("high", "demo", "diy"):
        return "high"
    if "indie" in obs or "cult" in obs or obs == "medium":
        return "medium"
    return "low"


def sanitise_picks(picks: list[dict], source_candidates: list[dict]) -> list[dict]:
    """Final cleanup pass: fix obscurity values, drop garbage artist labels.

    For every pick, in order:

    1. ``"obscurity"`` is rewritten to one of ``"high"``, ``"medium"``,
       ``"low"``.
    2. If the URL belongs to a known review site and the artist contains that
       site's name, artist/album are restored from the candidate with the
       same URL; the pick is dropped when no such candidate has an artist.
    3. If the artist matches ``_BAD_FIELD``, a replacement is derived from the
       matching candidate's title, then its Bandcamp URL, then its own
       ``"artist"`` value.
    4. Picks that ``looks_like_bad_pick`` still rejects are dropped.

    Args:
        picks: Pick dicts; they are modified in place.
        source_candidates: Candidate dicts, looked up by their ``"url"``.

    Returns:
        A new list holding the surviving pick dicts (the same objects that
        were passed in).
    """
    # Build URL → candidate map for fallback artist extraction
    url_map = {c.get("url", ""): c for c in source_candidates if c.get("url")}
    clean = []
    for p in picks:
        # Normalise obscurity to values the dashboard understands
        p["obscurity"] = _normalise_obscurity(p.get("obscurity"))

        # A key that is present but None would break the string checks
        # below, so treat it as an empty string.
        for field in ("artist", "album"):
            if field in p and p[field] is None:
                p[field] = ""
        url = p.get("url") or ""

        # If URL is a review article, artist shouldn't be the site/blog name
        review = _REVIEW_DOMAINS.search(url)
        if review:
            # Take the site name from the domain that actually matched, not
            # from the first host label: "m.example.com" would give "m",
            # and an empty label would be contained in every artist string.
            site_slug = review.group(0).split('.')[0].lower()
            artist_compact = (p.get("artist") or "").lower().replace(' ', '')
            if site_slug in artist_compact:
                # artist IS the domain — try to recover from prefilter data
                orig = url_map.get(url)
                if not (orig and orig.get("artist")):
                    # Can't recover — drop this pick
                    continue
                p["artist"] = orig["artist"]
                p["album"] = orig.get("album") or p.get("album", "")

        # If artist looks bad, try to recover from search result
        if _BAD_FIELD.search(p.get("artist", "")):
            orig = url_map.get(url)
            if orig:
                artist_fallback, album_fallback = parse_title(orig.get("title") or "")
                if not artist_fallback:
                    artist_fallback = artist_from_url(orig.get("url", ""))
                # Trailing `or ""` keeps the field a string even when the
                # candidate stores None for its artist.
                p["artist"] = artist_fallback or orig.get("artist", p["artist"]) or ""
                if not p.get("album"):
                    p["album"] = album_fallback

        if looks_like_bad_pick(p):
            log.debug(f"Dropping bad pick after cleanup: {p.get('artist')} / {p.get('album')}")
            continue
        clean.append(p)
    return clean
def extract_json_array ( text : str ) - > list :
""" Robustly extract a JSON array from LLM output. """
if not text :
@ -724,7 +828,8 @@ async def pipeline_run(run_id: str):
log_step ( " Curating picks with Mistral " )
picks = await curate_picks ( client , prefiltered , taste )
log_step ( f " { len ( picks ) } picks scored ≥50 " )
picks = sanitise_picks ( picks , scored )
log_step ( f " { len ( picks ) } picks after cleanup " )
if picks :
# Mark only actual picks as seen — unselected candidates stay eligible for re-eval