feat: implement Monochrome downloading

This commit is contained in:
2026-03-08 20:14:30 +01:00
parent 6dff83ac61
commit f4dee850f3
8 changed files with 1157 additions and 9 deletions

View File

@@ -0,0 +1,269 @@
"""
Convert Spotify URLs to Monochrome/Tidal track IDs.
Usage:
python spotify_to_ids.py <spotify_url> [<spotify_url>...] [-v] [--threshold N]
Supports track, album, and playlist URLs. Outputs one track ID per line (stdout).
Examples:
python spotify_to_ids.py https://open.spotify.com/track/4PTG3Z6ehGkBFwjybzWkR8
python spotify_to_ids.py -v https://open.spotify.com/album/4aawyAB9vmqN3uQ7FjRGTy
python spotify_to_ids.py https://open.spotify.com/playlist/xxx | xargs -I{} python download.py {}
"""
import argparse
import json
import os
import random
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from monochrome import fetch, fetch_json, discover_instances
# --- Spotify URL parsing ---
def parse_spotify_url(url):
    """Parse a Spotify URL into a (type, id) pair.

    Accepts track, album and playlist URLs, including the optional
    ``intl-xx`` locale path segment. Returns (None, None) when the URL
    does not match.
    """
    pattern = (
        r'https?://open\.spotify\.com/(?:intl-\w+/)?'
        r'(track|album|playlist)/([a-zA-Z0-9]+)'
    )
    m = re.match(pattern, url.strip())
    return (m.group(1), m.group(2)) if m else (None, None)
# --- Spotify metadata extraction ---
def fetch_spotify_embed(sp_type, sp_id):
    """Download Spotify's embed page for an entity and return the
    ``__NEXT_DATA__`` JSON payload as a dict, or None on any failure.

    Diagnostics go to stderr so stdout stays clean for track IDs.
    """
    embed_url = f"https://open.spotify.com/embed/{sp_type}/{sp_id}"
    try:
        with fetch(embed_url, timeout=15, use_ssl_ctx=False) as resp:
            page = resp.read().decode()
    except Exception as exc:
        # Best-effort: callers treat None as "no embed data available".
        print(f"[!] Failed to fetch Spotify embed: {exc}", file=sys.stderr)
        return None
    found = re.search(
        r'<script\s+id="__NEXT_DATA__"\s+type="application/json">\s*({.+?})\s*</script>',
        page, re.DOTALL
    )
    if found:
        try:
            return json.loads(found.group(1))
        except json.JSONDecodeError:
            pass
    print("[!] __NEXT_DATA__ not found in embed page", file=sys.stderr)
    return None
def fetch_spotify_oembed(sp_type, sp_id):
    """Fallback metadata source: Spotify's public oEmbed endpoint.

    Returns the entity title (possibly an empty string), or None when
    the request or response handling fails.
    """
    page_url = f"https://open.spotify.com/{sp_type}/{sp_id}"
    quoted = urllib.parse.quote(page_url, safe='')
    oembed_url = f"https://open.spotify.com/oembed?url={quoted}"
    try:
        payload = fetch_json(oembed_url, timeout=15, use_ssl_ctx=False)
        return payload.get("title", "")
    except Exception:
        return None
def extract_collection_name(embed_data, sp_type):
    """Return the album/playlist name from ``__NEXT_DATA__``, or None.

    Single tracks have no containing collection, so a "track" type (or
    missing/malformed embed data) always yields None.
    """
    if sp_type == "track" or not embed_data:
        return None
    try:
        entity = embed_data["props"]["pageProps"]["state"]["data"]["entity"]
    except (KeyError, TypeError, IndexError):
        return None
    return entity.get("name") or entity.get("title")
def extract_tracks(embed_data, sp_type, sp_id):
    """Build a list of ``{"title", "artist"}`` dicts for a Spotify entity.

    Reads the ``__NEXT_DATA__`` payload when available; otherwise falls
    back to the oEmbed API (single tracks only, artist unknown).
    Returns [] when nothing usable is found.
    """
    if embed_data:
        try:
            entity = embed_data["props"]["pageProps"]["state"]["data"]["entity"]
            if sp_type == "track":
                name = entity.get("name") or entity.get("title", "")
                artist_list = entity.get("artists")
                if isinstance(artist_list, list) and artist_list:
                    artist = artist_list[0].get("name", "")
                else:
                    artist = entity.get("subtitle", "")
                if name:
                    return [{"title": name, "artist": artist}]
            elif sp_type in ("album", "playlist"):
                # Entries without a title are silently dropped.
                collected = [
                    {"title": item.get("title", ""), "artist": item.get("subtitle", "")}
                    for item in entity.get("trackList", [])
                    if item.get("title", "")
                ]
                if collected:
                    return collected
        except (KeyError, TypeError, IndexError):
            pass
    # Fallback: oEmbed offers only a title string, and only for tracks.
    if sp_type == "track":
        title = fetch_spotify_oembed(sp_type, sp_id)
        if title:
            print(f'[*] Using oEmbed fallback: "{title}"', file=sys.stderr)
            return [{"title": title, "artist": ""}]
    return []
# --- Fuzzy matching ---
def normalize(text):
    """Lower-case *text*, strip parenthesized feat./ft. credits and
    remaster tags, replace punctuation with spaces, and collapse runs
    of whitespace to single spaces."""
    cleaned = text.lower()
    for credit in (r'\(feat\.?[^)]*\)', r'\(ft\.?[^)]*\)'):
        cleaned = re.sub(credit, '', cleaned)
    cleaned = re.sub(r'\(remaster(ed)?\)', '', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'[^\w\s]', ' ', cleaned)
    return ' '.join(cleaned.split())
def similarity(a, b):
    """Jaccard index over normalized word tokens.

    Returns 0.0 when either string normalizes to no tokens at all.
    """
    left = set(normalize(a).split())
    right = set(normalize(b).split())
    if left and right:
        return len(left & right) / len(left | right)
    return 0.0
def find_best_match(results, target_title, target_artist, threshold=0.4):
    """Pick the Monochrome search result closest to the target track.

    Each result is scored as 0.6 * title similarity + 0.4 * artist
    similarity; when the target artist is unknown the artist component
    is pinned to a neutral 0.5. Returns (result, score) when the top
    score reaches *threshold*, otherwise (None, 0).
    """
    top = None
    top_score = 0
    for candidate in results:
        cand_title = candidate.get("title", "")
        artist_field = candidate.get("artist", {})
        if isinstance(artist_field, dict):
            cand_artist = artist_field.get("name", "")
        else:
            cand_artist = str(artist_field)
        if target_artist:
            artist_part = similarity(target_artist, cand_artist)
        else:
            artist_part = 0.5
        total = 0.6 * similarity(target_title, cand_title) + 0.4 * artist_part
        if total > top_score:
            top_score = total
            top = candidate
    if top and top_score >= threshold:
        return top, top_score
    return None, 0
# --- Monochrome search ---
def search_monochrome(instances, query, log=None):
    """Query Monochrome instances (in random order) for matching tracks.

    Returns the first responsive instance's result list, unwrapping the
    response envelopes the API may use ({"data": ..., "version": ...},
    {"items": [...]}, a bare list, or {"tracks": [...]}). Returns []
    when every instance fails or returns an unrecognized shape.

    NOTE(review): *log* is accepted and defaulted but currently unused;
    kept for interface compatibility — confirm whether it was meant to
    report per-instance failures.
    """
    if log is None:
        log = print
    candidates = list(instances)
    random.shuffle(candidates)
    term = urllib.parse.quote(query)
    for base in candidates:
        try:
            payload = fetch_json(f"{base}/search/?s={term}", timeout=15)
            if isinstance(payload, dict) and "data" in payload and "version" in payload:
                payload = payload["data"]
            if isinstance(payload, dict) and "items" in payload:
                return payload["items"]
            if isinstance(payload, list):
                return payload
            if isinstance(payload, dict) and "tracks" in payload:
                return payload["tracks"]
        except Exception:
            continue  # best-effort: try the next instance
    return []
# --- Main ---
def main():
    """CLI entry point: resolve Spotify URLs to Monochrome track IDs.

    Matched IDs are written to stdout (one per line, or with metadata
    in verbose mode); all progress and error messages go to stderr so
    the output can be piped into other tools.
    """
    parser = argparse.ArgumentParser(
        description="Convert Spotify URLs to Monochrome/Tidal track IDs"
    )
    parser.add_argument("urls", nargs="+", help="Spotify track/album/playlist URLs")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Show matched title/artist alongside IDs")
    parser.add_argument("--threshold", type=float, default=0.4,
                        help="Minimum match score 0-1 (default: 0.4)")
    args = parser.parse_args()

    instances = discover_instances()
    matched_count = 0
    missed_count = 0
    for url in args.urls:
        sp_type, sp_id = parse_spotify_url(url)
        if sp_type is None:
            print(f"[!] Invalid Spotify URL: {url}", file=sys.stderr)
            continue
        print(f"[*] Fetching Spotify {sp_type}: {sp_id}", file=sys.stderr)
        embed_data = fetch_spotify_embed(sp_type, sp_id)
        tracks = extract_tracks(embed_data, sp_type, sp_id)
        if not tracks:
            print(f"[!] Could not extract tracks from {url}", file=sys.stderr)
            continue
        print(f"[*] Found {len(tracks)} track(s) on Spotify", file=sys.stderr)
        last = len(tracks) - 1
        for idx, track in enumerate(tracks):
            query = f"{track['artist']} {track['title']}".strip()
            print(f"[*] Searching: {query}", file=sys.stderr)
            hits = search_monochrome(instances, query)
            match, score = find_best_match(hits, track["title"], track["artist"], args.threshold)
            if match is None:
                missed_count += 1
                print(f"[!] No match: {track['artist']} - {track['title']}", file=sys.stderr)
            else:
                matched_count += 1
                tid = match.get("id")
                if args.verbose:
                    m_title = match.get("title", "?")
                    artist_field = match.get("artist", {})
                    m_artist = artist_field.get("name", "?") if isinstance(artist_field, dict) else str(artist_field)
                    print(f"{tid}\t{m_artist} - {m_title}\t(score: {score:.2f})")
                else:
                    print(tid)
            # Rate-limit courtesy pause between searches; skipped after
            # the final track of each collection.
            if idx < last:
                time.sleep(0.5)
    print(f"\n[*] Done: {matched_count} matched, {missed_count} missed", file=sys.stderr)
if __name__ == "__main__":
    # Allow running as standalone script.
    # NOTE(review): this sys.path tweak executes only after the
    # module-level `from monochrome import ...` has already run, so it
    # cannot influence that import — presumably `monochrome` resolves
    # via the script's own directory; confirm the parent-dir insert is
    # still needed.
    sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    main()