@ -49,62 +49,120 @@ settings = Settings()
# ── SEARXNG QUERY BANK ───────────────────────────────────────────────────────
# All 29 queries — dynamic year, rotating subgenre index
# Subgenre names substituted verbatim into search queries by build_queries().
# Entries must not carry leading/trailing whitespace: they are interpolated
# directly into query strings (including quoted phrases and site: filters).
SUBGENRES = [
    "death metal", "black metal", "doom metal", "thrash metal", "sludge metal",
    "progressive metal", "blackened death metal", "post-metal", "funeral doom", "grindcore",
    "melodic death metal", "technical death metal", "atmospheric black metal", "war metal",
    "drone metal", "stoner metal", "gothic metal", "power metal", "viking metal", "brutal death metal",
    "deathcore", "mathcore", "noise rock", "crust punk", "black doom", "death doom",
    "dissonant death metal", "ambient black metal", "depressive black metal", "crossover thrash",
]
# Labels to search on Bandcamp — no time_range, broad results
# Each entry is wrapped in double quotes inside a `site:bandcamp.com "<label>"`
# query, so it must be the exact phrase with no surrounding whitespace.
BANDCAMP_LABELS = [
    "sentient ruin",
    "20 buck spin",
    "prosthetic records",
    "unique leader records",
    "redefining darkness",
    "profound lore",
    "season of mist",
    "century media",
    "nuclear blast",
    "relapse records",
    "metal blade records",
    "dark descent records",
    "iron bonehead",
    "les acteurs de l'ombre",
    "vrasubatlat",
    "me saco un ojo",
    "hells headbangers",
    "invictus productions",
    "blood harvest",
    "chaos records",
    "adagio 830",
    "floga records",
    "sewer rot records",
    "lavadome productions",
    "memento mori",
]
# Review/news sites — use time_range year to get fresh but not stale
# Bare domains; build_queries() prefixes each with `site:`, so an entry must
# contain no whitespace or the site filter would not bind to the domain.
REVIEW_SITES = [
    "metalinjection.net",
    "cvltnation.com",
    "decibelmag.com",
    "invisibleoranges.com",
    "brooklynvegan.com",
    "heavyblogisheavy.com",
    "nocturnalcult.com",
    "themetalcrypt.com",
    "angrymetalguy.com",
    "sputnikmusic.com",
    "nocleansinging.com",
    "rateyourmusic.com",
    "terrorizer.com",
    "kerrang.com",
]
def build_queries() -> list[str]:
    """Build the list of SearXNG query strings for the current run.

    Three module-level pools (SUBGENRES, BANDCAMP_LABELS, REVIEW_SITES) are
    sampled through time-based rotating windows so that successive runs hit
    different niches. Each window's start index advances by one per period,
    so consecutive windows overlap rather than partition the pool.

    Returns:
        Query strings in a stable order: subgenre releases, Bandcamp
        subgenre searches, Bandcamp label searches, review-site searches,
        underground/demo searches, regional searches, Metal Archives
        searches, and broad discovery searches.
    """
    year = datetime.now().year
    # Sample the clock once so all three rotations refer to the same instant.
    now = time.time()

    def window(pool: list[str], period_s: int, count: int) -> list[str]:
        """Return `count` consecutive pool items, wrapping around the end.

        The start index is the number of whole `period_s` intervals since
        the epoch, modulo the pool length.
        """
        start = int(now // period_s) % len(pool)
        return [pool[(start + i) % len(pool)] for i in range(count)]

    sgs = window(SUBGENRES, 14400, 6)           # 6 subgenres, shifts every 4 hours
    labels = window(BANDCAMP_LABELS, 21600, 5)  # 5 labels, shifts every 6 hours
    sites = window(REVIEW_SITES, 43200, 4)      # 4 sites, shifts every 12 hours

    queries: list[str] = []

    # Subgenre releases: first half carries the year literal, second half
    # omits it for breadth.
    for sg in sgs[:3]:
        queries.append(f'{sg} new album release {year}')
    for sg in sgs[3:]:
        queries.append(f'{sg} new album bandcamp')

    # Bandcamp direct, by subgenre.
    for sg in sgs[:2]:
        queries.append(f'site:bandcamp.com {sg} album {year}')
        queries.append(f'site:bandcamp.com {sg} new release')

    # Bandcamp label searches: exact-phrase label name, no year literal.
    for label in labels:
        queries.append(f'site:bandcamp.com "{label}"')

    # Review sites, scoped by the year literal.
    for site in sites:
        queries.append(f'site:{site} album review {year}')
        queries.append(f'site:{site} new album {year}')

    # Bandcamp underground / demo scene.
    for sg in sgs[:2]:
        queries.append(f'underground {sg} demo {year} bandcamp')
        queries.append(f'new {sg} EP {year} bandcamp')

    queries.extend([
        # Michigan / regional
        f'Michigan metal band new album {year}',
        f'Detroit metal band new release {year}',
        f'Midwest metal new release {year} bandcamp',
        # Metal Archives
        f'site:metal-archives.com album review {year}',
        f'site:metal-archives.com new band {year}',
        # Broad discovery
        f'best underground metal albums {year}',
        f'new metal releases {year} bandcamp full stream',
        f'hidden gem metal album {year}',
        f'obscure metal release {year}',
        f'metal full album stream {year} new band',
    ])
    return queries
# ── NOISE FILTERS ────────────────────────────────────────────────────────────
@ -255,16 +313,22 @@ async def search_candidates(client: httpx.AsyncClient) -> list[dict]:
log . info ( f " Running { len ( queries ) } SearXNG queries " )
async def one_query ( q : str ) - > list [ dict ] :
# Use year time_range only for queries that include the current year literal;
# skip time_range for Bandcamp label/genre queries to maximise breadth
use_time_range = " site:bandcamp.com " not in q and " metal-archives " not in q
params : dict = { " q " : q , " format " : " json " }
if use_time_range :
params [ " time_range " ] = " year "
try :
r = await client . get (
f " { settings . searxng_url } /search " ,
params = { " q " : q , " format " : " json " , " time_range " : " month " } ,
timeout = 20 ,
params = params ,
timeout = 2 5 ,
)
if r . status_code != 200 :
return [ ]
data = r . json ( )
results = data . get ( " results " , [ ] ) [ : 15] # top 1 5 per query
results = data . get ( " results " , [ ] ) [ : 25] # top 2 5 per query
out = [ ]
for res in results :
title = res . get ( " title " , " " )
@ -363,56 +427,41 @@ async def score_candidates(
return scored
async def prefilter_ candidates (
client : httpx . AsyncClient , candidates : list [ dict ]
async def prefilter_ batch (
client : httpx . AsyncClient , batch : list [ dict ] , batch_offset : int
) - > list [ dict ] :
""" Use mistral-nemo to prefilter — keep actual album releases, drop noise. """
if not candidates :
return [ ]
# Send in batches of 40
batch = candidates [ : 80 ]
""" Run prefilter on one batch. batch_offset adjusts index for provenance mapping. """
items_for_prompt = [
{ " index " : i , " artist " : c . get ( " artist " , " " ) , " album " : c . get ( " album " , " " ) , " url " : c . get ( " url " , " " ) }
for i , c in enumerate ( batch )
]
prompt = f """ You are a metal music expert. Review these { len ( items_for_prompt ) } items from web searches.
Return ONLY a JSON array of items that are actual metal album or EP releases ( not news articles , not Wikipedia , not lists ) .
For each item kept , include : index ( integer ) , artist ( string ) , album ( string ) , subgenre ( string ) , confidence ( 0.0 - 1.0 ) .
Return ONLY a JSON array of items that are actual metal album or EP releases ( not news , not Wikipedia , not lists , not tour dates ) .
For each kept item include : index ( integer ) , artist ( string ) , album ( string ) , subgenre ( string ) , confidence ( 0.0 - 1.0 ) .
Be inclusive — keep anything that looks like it could be a real release even if you ' re not certain.
Items :
{ json . dumps ( items_for_prompt , indent = 2 ) }
Return ONLY valid JSON array starting with [ """
body = {
" model " : " mistral-nemo:latest " ,
" messages " : [ { " role " : " user " , " content " : prompt } ] ,
" stream " : False ,
" options " : { " temperature " : 0.1 , " num_predict " : 4 000} ,
" options " : { " temperature " : 0.1 , " num_predict " : 6 000} ,
}
try :
r = await client . post (
f " { settings . ollama_url } /api/chat " ,
json = body ,
timeout = 120 ,
)
r = await client . post ( f " { settings . ollama_url } /api/chat " , json = body , timeout = 150 )
raw = r . json ( ) . get ( " message " , { } ) . get ( " content " , " " ) if r . status_code == 200 else " "
except Exception as e :
log . error( f " Prefilter failed: { e } " )
return candidates[ : 40 ] # fall through with top candidates
log . warning ( f " Prefilter batch failed: { e } " )
return batch # pass through on error
# Extract JSON from response
parsed = extract_json_array ( raw )
if not parsed :
log . warning ( " Prefilter returned no parseable JSON — using all candidates " )
return candidates [ : 40 ]
log . info ( f " Prefilter kept { len ( parsed ) } / { len ( batch ) } candidates " )
log . warning ( f " Prefilter batch { batch_offset } : no JSON — passing through " )
return batch
# Map back to full candidate objects with prefilter data
enriched = [ ]
for p in parsed :
idx = p . get ( " index " )
@ -426,10 +475,35 @@ Return ONLY valid JSON array starting with ["""
" confidence " : p . get ( " confidence " , 0.5 ) ,
}
enriched . append ( orig )
return enriched
async def prefilter_candidates(
    client: httpx.AsyncClient, candidates: list[dict]
) -> list[dict]:
    """Prefilter candidates by sending them to prefilter_batch in batches.

    At most MAX_CANDIDATES leading candidates are evaluated; any remainder is
    dropped and a warning is logged. Batches are dispatched in groups of
    MAX_CONCURRENT via asyncio.gather, and each group is awaited before the
    next one starts.

    Args:
        client: HTTP client passed through to prefilter_batch.
        candidates: Candidate dicts to filter.

    Returns:
        The concatenation of every batch's result, in batch order. Empty
        when `candidates` is empty.
    """
    if not candidates:
        return []

    BATCH_SIZE = 35
    MAX_CANDIDATES = 200
    MAX_CONCURRENT = 3  # limit simultaneous batches to avoid OOMing Ollama

    # Slice first so the cap is exact: batching the uncapped list by start
    # index alone would let the final batch run past MAX_CANDIDATES.
    considered = candidates[:MAX_CANDIDATES]
    if len(considered) < len(candidates):
        log.warning(
            f"Prefilter cap reached: evaluating first {len(considered)} "
            f"of {len(candidates)} candidates"
        )

    batches = [
        considered[start:start + BATCH_SIZE]
        for start in range(0, len(considered), BATCH_SIZE)
    ]
    log.info(f"Prefiltering {len(candidates)} candidates in {len(batches)} parallel batches")

    all_enriched: list[dict] = []
    for group_start in range(0, len(batches), MAX_CONCURRENT):
        group = batches[group_start:group_start + MAX_CONCURRENT]
        # batch_offset is the position of the batch's first item within
        # `considered`.
        results = await asyncio.gather(*[
            prefilter_batch(client, batch, (group_start + i) * BATCH_SIZE)
            for i, batch in enumerate(group)
        ])
        for kept in results:
            all_enriched.extend(kept)

    log.info(f"Prefilter kept {len(all_enriched)} / {len(candidates)} candidates")
    return all_enriched
async def curate_picks (
client : httpx . AsyncClient , candidates : list [ dict ] , taste : dict
) - > list [ dict ] :
@ -437,7 +511,7 @@ async def curate_picks(
if not candidates :
return [ ]
top = sorted ( candidates , key = lambda x : - x . get ( " embedScore " , 0 ) ) [ : 2 0]
top = sorted ( candidates , key = lambda x : - x . get ( " embedScore " , 0 ) ) [ : 3 0]
top_artists = [ a [ " name " ] for a in taste . get ( " topArtists " , [ ] ) [ : 20 ] ]
recent_artists = [ a [ " name " ] for a in taste . get ( " recentArtists " , [ ] ) [ : 10 ] ]
year = datetime . now ( ) . year
@ -497,19 +571,19 @@ Return ONLY a JSON array starting with ["""
raw = r . json ( ) . get ( " message " , { } ) . get ( " content " , " " ) if r . status_code == 200 else " "
picks = extract_json_array ( raw )
if picks :
# Validate provenance: url must match a search result OR index must be valid
# Validate provenance: keep picks that reference a real input item
validated = [ ]
for p in picks :
if p . get ( " score " , 0 ) < 6 0:
if p . get ( " score " , 0 ) < 5 0:
continue
idx = p . get ( " index " )
url = p . get ( " url " , " " )
if idx is not None and idx in top_by_index :
# Merge URL from original candidate if LLM dropped it
if not url :
p [ " url " ] = top_by_index [ idx ] . get ( " url " , " " )
# Always use original URL from search result
p [ " url " ] = top_by_index [ idx ] . get ( " url " , url )
validated . append ( p )
elif url and url in known_urls :
elif url and any ( url . startswith ( ku [ : 40 ] ) for ku in known_urls if ku ) :
# URL prefix match (handles trailing-slash variants)
validated . append ( p )
else :
log . debug ( f " Dropped hallucinated pick: { p . get ( ' artist ' ) } — { p . get ( ' album ' ) } " )
@ -650,11 +724,11 @@ async def pipeline_run(run_id: str):
log_step ( " Curating picks with Mistral " )
picks = await curate_picks ( client , prefiltered , taste )
log_step ( f " { len ( picks ) } picks scored ≥ 6 0" )
log_step ( f " { len ( picks ) } picks scored ≥ 5 0" )
if picks :
# Mark all prefiltered candidates as seen (not just picks) to avoid re-processing
await mark_seen ( p refiltered )
# Mark only actual picks as seen — unselected candidates stay eligible for re-eval
await mark_seen ( p icks )
await write_picks ( picks , {
" candidates " : len ( candidates ) ,
" filtered " : len ( prefiltered ) ,