"""
Convert Spotify URLs to Monochrome/Tidal track IDs.

Usage:
    python spotify_to_ids.py <spotify_url> [<spotify_url>...] [-v] [--threshold N]

Supports track, album, and playlist URLs. Outputs one track ID per line (stdout).

Examples:
    python spotify_to_ids.py https://open.spotify.com/track/4PTG3Z6ehGkBFwjybzWkR8
    python spotify_to_ids.py -v https://open.spotify.com/album/4aawyAB9vmqN3uQ7FjRGTy
    python spotify_to_ids.py https://open.spotify.com/playlist/xxx | xargs -I{} python download.py {}
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import random
|
|
import re
|
|
import sys
|
|
import time
|
|
import urllib.error
|
|
import urllib.parse
|
|
import urllib.request
|
|
|
|
from monochrome import fetch, fetch_json, discover_instances
|
|
|
|
|
|
# --- Spotify URL parsing ---
|
|
|
|
def parse_spotify_url(url):
    """Parse a Spotify URL into (type, id). Returns (None, None) on failure."""
    pattern = (
        r'https?://open\.spotify\.com/(?:intl-\w+/)?'
        r'(track|album|playlist)/([a-zA-Z0-9]+)'
    )
    m = re.match(pattern, url.strip())
    # Anything after the ID (e.g. "?si=...") is ignored by re.match.
    return (m.group(1), m.group(2)) if m else (None, None)
|
|
|
|
|
|
# --- Spotify metadata extraction ---
|
|
|
|
def fetch_spotify_embed(sp_type, sp_id):
    """Fetch Spotify embed page and extract __NEXT_DATA__ JSON.

    Returns the parsed JSON dict, or None on any fetch/parse failure
    (errors are reported on stderr).
    """
    embed_url = f"https://open.spotify.com/embed/{sp_type}/{sp_id}"
    try:
        with fetch(embed_url, timeout=15, use_ssl_ctx=False) as resp:
            page = resp.read().decode()
    except Exception as exc:
        print(f"[!] Failed to fetch Spotify embed: {exc}", file=sys.stderr)
        return None

    # The embed page ships its data as an inline Next.js JSON payload.
    found = re.search(
        r'<script\s+id="__NEXT_DATA__"\s+type="application/json">\s*({.+?})\s*</script>',
        page, re.DOTALL
    )
    if found:
        try:
            return json.loads(found.group(1))
        except json.JSONDecodeError:
            pass

    print("[!] __NEXT_DATA__ not found in embed page", file=sys.stderr)
    return None
|
|
|
|
|
|
def fetch_spotify_oembed(sp_type, sp_id):
    """Fallback: use oEmbed API to get at least a title string.

    Returns the title (possibly ""), or None on any failure.
    """
    target = f"https://open.spotify.com/{sp_type}/{sp_id}"
    endpoint = f"https://open.spotify.com/oembed?url={urllib.parse.quote(target, safe='')}"
    try:
        payload = fetch_json(endpoint, timeout=15, use_ssl_ctx=False)
        return payload.get("title", "")
    except Exception:
        # Best-effort fallback: any failure just means "no title".
        return None
|
|
|
|
|
|
def extract_collection_name(embed_data, sp_type):
    """Extract album/playlist name from __NEXT_DATA__ JSON. Returns None for single tracks."""
    if sp_type == "track" or not embed_data:
        return None
    try:
        node = embed_data["props"]["pageProps"]["state"]["data"]["entity"]
    except (KeyError, TypeError, IndexError):
        return None
    # Albums use "name"; playlists sometimes use "title".
    return node.get("name") or node.get("title")
|
|
|
|
|
|
def extract_tracks(embed_data, sp_type, sp_id):
    """Extract list of {title, artist} dicts from __NEXT_DATA__ JSON.

    Falls back to oEmbed if embed data is missing or malformed."""
    if embed_data:
        try:
            entity = embed_data["props"]["pageProps"]["state"]["data"]["entity"]

            if sp_type == "track":
                name = entity.get("name") or entity.get("title", "")
                artist_list = entity.get("artists")
                # Prefer the structured artists array; fall back to the
                # display subtitle when it is absent.
                if artist_list and isinstance(artist_list, list):
                    performer = artist_list[0].get("name", "")
                else:
                    performer = entity.get("subtitle", "")
                if name:
                    return [{"title": name, "artist": performer}]

            elif sp_type in ("album", "playlist"):
                collected = [
                    {"title": item.get("title", ""), "artist": item.get("subtitle", "")}
                    for item in entity.get("trackList", [])
                    if item.get("title", "")
                ]
                if collected:
                    return collected
        except (KeyError, TypeError, IndexError):
            pass

    # Fallback: oEmbed (single tracks only, limited data)
    if sp_type == "track":
        fallback_title = fetch_spotify_oembed(sp_type, sp_id)
        if fallback_title:
            print(f'[*] Using oEmbed fallback: "{fallback_title}"', file=sys.stderr)
            return [{"title": fallback_title, "artist": ""}]

    return []
|
|
|
|
|
|
# --- Fuzzy matching ---
|
|
|
|
def normalize(text):
    """Normalize text for comparison: lowercase, strip feat/remaster/punctuation."""
    cleaned = text.lower()
    # Strip parenthesized credit/remaster annotations, in order.
    for pattern, pattern_flags in (
        (r'\(feat\.?[^)]*\)', 0),
        (r'\(ft\.?[^)]*\)', 0),
        (r'\(remaster(ed)?\)', re.IGNORECASE),
    ):
        cleaned = re.sub(pattern, '', cleaned, flags=pattern_flags)
    # Collapse punctuation to spaces, then squeeze runs of whitespace.
    cleaned = re.sub(r'[^\w\s]', ' ', cleaned)
    return ' '.join(cleaned.split())
|
|
|
|
|
|
def similarity(a, b):
    """Token overlap ratio (Jaccard index)."""
    set_a = set(normalize(a).split())
    set_b = set(normalize(b).split())
    # Either side empty means no basis for comparison.
    if set_a and set_b:
        return len(set_a & set_b) / len(set_a | set_b)
    return 0.0
|
|
|
|
|
|
def find_best_match(results, target_title, target_artist, threshold=0.4):
    """Find the best matching track from Monochrome search results.

    Returns (track, score) for the highest-scoring result at or above
    *threshold*, else (None, 0).
    """
    top = None
    top_score = 0

    for candidate in results:
        cand_title = candidate.get("title", "")
        artist_field = candidate.get("artist", {})
        cand_artist = (
            artist_field.get("name", "")
            if isinstance(artist_field, dict)
            else str(artist_field)
        )

        # Title dominates the score; a missing target artist
        # contributes a neutral 0.5 instead of penalizing.
        title_score = similarity(target_title, cand_title)
        artist_score = similarity(target_artist, cand_artist) if target_artist else 0.5
        combined = 0.6 * title_score + 0.4 * artist_score

        if combined > top_score:
            top_score, top = combined, candidate

    if top and top_score >= threshold:
        return top, top_score
    return None, 0
|
|
|
|
|
|
# --- Monochrome search ---
|
|
|
|
def search_monochrome(instances, query, log=None):
    """Search Monochrome instances for tracks matching a query string.

    Args:
        instances: iterable of Monochrome base URLs to try.
        query: free-text search string (typically "artist title").
        log: optional callable taking one message string for diagnostics.
            Defaults to printing to stderr, matching the rest of this
            script's convention (stdout is reserved for track IDs).

    Returns:
        A list of track dicts from the first instance that answers with a
        recognizable payload, or [] if every instance fails.
    """
    # Fix: `log` was previously assigned but never called, so instance
    # failures were swallowed silently. Route diagnostics through it.
    if log is None:
        log = lambda msg: print(msg, file=sys.stderr)

    shuffled = list(instances)
    random.shuffle(shuffled)  # spread load across instances
    encoded = urllib.parse.quote(query)

    for base in shuffled:
        url = f"{base}/search/?s={encoded}"
        try:
            data = fetch_json(url, timeout=15)
        except Exception as e:
            log(f"[!] Search failed on {base}: {e}")
            continue
        # Unwrap {"data": ..., "version": ...} response envelopes.
        if isinstance(data, dict) and "data" in data and "version" in data:
            data = data["data"]
        # Accept the three response shapes seen across instances.
        if isinstance(data, dict) and "items" in data:
            return data["items"]
        if isinstance(data, list):
            return data
        if isinstance(data, dict) and "tracks" in data:
            return data["tracks"]
    return []
|
|
|
|
|
|
# --- Main ---
|
|
|
|
def main():
    """CLI entry point: resolve each Spotify URL to Monochrome track IDs."""
    parser = argparse.ArgumentParser(
        description="Convert Spotify URLs to Monochrome/Tidal track IDs"
    )
    parser.add_argument("urls", nargs="+", help="Spotify track/album/playlist URLs")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Show matched title/artist alongside IDs")
    parser.add_argument("--threshold", type=float, default=0.4,
                        help="Minimum match score 0-1 (default: 0.4)")
    args = parser.parse_args()

    instances = discover_instances()
    matched_count = 0
    missed_count = 0

    for url in args.urls:
        sp_type, sp_id = parse_spotify_url(url)
        if sp_type is None:
            print(f"[!] Invalid Spotify URL: {url}", file=sys.stderr)
            continue

        # All diagnostics go to stderr; stdout carries only track IDs.
        print(f"[*] Fetching Spotify {sp_type}: {sp_id}", file=sys.stderr)
        embed_data = fetch_spotify_embed(sp_type, sp_id)
        tracks = extract_tracks(embed_data, sp_type, sp_id)

        if not tracks:
            print(f"[!] Could not extract tracks from {url}", file=sys.stderr)
            continue

        print(f"[*] Found {len(tracks)} track(s) on Spotify", file=sys.stderr)

        last_index = len(tracks) - 1
        for index, entry in enumerate(tracks):
            query = f"{entry['artist']} {entry['title']}".strip()
            print(f"[*] Searching: {query}", file=sys.stderr)

            hits = search_monochrome(instances, query)
            hit, score = find_best_match(hits, entry["title"], entry["artist"], args.threshold)

            if hit:
                matched_count += 1
                track_id = hit.get("id")
                if args.verbose:
                    shown_title = hit.get("title", "?")
                    artist_field = hit.get("artist", {})
                    shown_artist = artist_field.get("name", "?") if isinstance(artist_field, dict) else str(artist_field)
                    print(f"{track_id}\t{shown_artist} - {shown_title}\t(score: {score:.2f})")
                else:
                    print(track_id)
            else:
                missed_count += 1
                print(f"[!] No match: {entry['artist']} - {entry['title']}", file=sys.stderr)

            # Rate limit delay between searches (skip after last track)
            if index != last_index:
                time.sleep(0.5)

    print(f"\n[*] Done: {matched_count} matched, {missed_count} missed", file=sys.stderr)
|
|
|
|
|
|
if __name__ == "__main__":
    # Allow running as standalone script
    # NOTE(review): this sys.path tweak executes only AFTER the module-level
    # `from monochrome import ...` at the top of the file has already run,
    # so it cannot help that import resolve — confirm whether anything
    # imported later still needs it, or whether it should be hoisted above
    # the monochrome import.
    sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    main()
|