#!/usr/bin/env python3
import json
import os
import random
import re
import time
import urllib.request
from html import unescape

# Base directory for all outputs (conventional web root — presumably served
# by the site; confirm against the deployment layout).
BASE_DIR = "/var/www/html"
# Output file: one "Artist - Title" line per song.
SEED_FILE = os.path.join(BASE_DIR, "seeds", "songs.txt")
# Persisted run metadata (last run timestamp, total seed count).
STATE_FILE = os.path.join(BASE_DIR, "storage", "seed_state.json")

# Public chart/playlist pages scraped for candidate song lines.
SOURCES = [
    {
        "name": "apple_top_100_tr",
        "url": "https://music.apple.com/tr/playlist/top-100-t%C3%BCrkiye/pl.f3e0d6ef238542609572c18b0de1513b?l=tr",
    },
    {
        "name": "spotify_top_songs_tr",
        "url": "https://open.spotify.com/embed/playlist/37i9dQZEVXbJARRcHjHcAr?utm_source=generator",
    },
    {
        "name": "spotify_popular_tr",
        "url": "https://open.spotify.com/popular-in/tr",
    },
    {
        "name": "youtube_music_top_tr",
        "url": "https://music.youtube.com/playlist?list=PL4fGSI1pDJn5tdVDtIAZArERm_vv4uFCR",
    },
]

# Sent with every request; self-identifying bot UA per crawler etiquette.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; AkorlarSeedBot/1.0; +https://akorlar.org)"
}

def fetch(url: str) -> str:
    """Download *url* and return the body decoded as UTF-8 (bad bytes dropped)."""
    request = urllib.request.Request(url, headers=HEADERS)
    with urllib.request.urlopen(request, timeout=25) as response:
        body = response.read()
    return body.decode("utf-8", errors="ignore")

def clean_text(text: str) -> str:
    """Unescape HTML entities, strip tags, and collapse runs of whitespace."""
    without_entities = unescape(text)
    without_tags = re.sub(r"<[^>]+>", " ", without_entities)
    return re.sub(r"\s+", " ", without_tags).strip()

def normalize_song_line(line: str) -> str:
    """Canonicalize a candidate line: unify dash characters, normalize the
    separator to " - ", and trim stray whitespace/dashes at the edges."""
    line = clean_text(line)
    for dash in ("·", "—", "–"):
        line = line.replace(dash, "-")
    line = re.sub(r"\s*-\s*", " - ", line)
    return re.sub(r"\s+", " ", line).strip(" -")

# Boilerplate phrases that mark page chrome/navigation rather than a song.
# Hoisted to module level so the collection is not rebuilt on every call.
_BAD_PHRASES = (
    "playlist",
    "open in music",
    "preview",
    "featured artists",
    "your daily update",
    "trending songs in turkey",
    "top songs - turkey",
    "top 100: türkiye",
    "top 100 songs turkey",
    "music video",
    "official music video",
    "home new radio search",
)

# A line consisting solely of digits, whitespace and punctuation is junk.
_JUNK_LINE_RE = re.compile(r"[\d\s\W]+")


def valid_song_line(line: str) -> bool:
    """Return True if *line* looks like a plausible "Artist - Title" seed.

    A valid line contains " - " with at least two characters on each side,
    is 6-160 characters long, contains none of the known boilerplate
    phrases, and is not made up purely of digits/whitespace/punctuation.
    """
    if " - " not in line:
        return False
    if not 6 <= len(line) <= 160:
        return False

    lower = line.lower()
    if any(phrase in lower for phrase in _BAD_PHRASES):
        return False

    # Require both a non-trivial artist part and a non-trivial title part.
    left, _, right = line.partition(" - ")
    if len(left.strip()) < 2 or len(right.strip()) < 2:
        return False

    # Reject mostly-numeric / meaningless lines.
    if _JUNK_LINE_RE.fullmatch(line):
        return False

    return True

# Compiled once at import time; all three heuristics run on every page.
# "Artist - Title" pairs separated by a hyphen/en dash/em dash.
_DASH_PAIR_RE = re.compile(r"([A-Za-z0-9ÇĞİÖŞÜçğıöşü&'./,+() ]{2,80})\s[-–—]\s([A-Za-z0-9ÇĞİÖŞÜçğıöşü&'./,+()!? ]{2,100})")
# "name": "..." values inside embedded JSON blobs (matched on raw HTML).
_JSON_NAME_RE = re.compile(r'"name"\s*:\s*"([^"]{2,120})"')
# "Artist . Title" pairs seen in some meta descriptions.
_DOT_PAIR_RE = re.compile(r"([A-Za-z0-9ÇĞİÖŞÜçğıöşü&'./,+() ]{2,80})\s\.\s([A-Za-z0-9ÇĞİÖŞÜçğıöşü&'./,+()!? ]{2,100})")


def extract_candidates(html: str) -> list[str]:
    """Extract candidate "Artist - Title" lines from a playlist page.

    Applies three heuristics — dash-separated pairs and dot-separated pairs
    over the tag-stripped text, plus JSON "name" values over the raw HTML —
    then normalizes each hit and keeps only those passing valid_song_line.
    Returns the unique candidates in arbitrary (set) order.
    """
    text = clean_text(html)
    candidates: set[str] = set()

    # Artist - Title pairs in visible page text.
    for m in _DASH_PAIR_RE.finditer(text):
        line = normalize_song_line(f"{m.group(1)} - {m.group(2)}")
        if valid_song_line(line):
            candidates.add(line)

    # JSON "name" fields; normalize_song_line already runs clean_text,
    # so no separate cleaning pass is needed here.
    for m in _JSON_NAME_RE.finditer(html):
        item = normalize_song_line(m.group(1))
        if valid_song_line(item):
            candidates.add(item)

    # Dot-separated pairs in meta descriptions, rewritten as dash pairs.
    for m in _DOT_PAIR_RE.finditer(text):
        line = normalize_song_line(f"{m.group(1)} - {m.group(2)}")
        if valid_song_line(line):
            candidates.add(line)

    return list(candidates)

def load_existing() -> list[str]:
    """Read the current seed file; return its non-blank lines (stripped).

    A missing file is treated as an empty seed list.
    """
    if not os.path.exists(SEED_FILE):
        return []
    with open(SEED_FILE, "r", encoding="utf-8") as handle:
        stripped = (raw.strip() for raw in handle)
        return [entry for entry in stripped if entry]

def save_seed(lines: list[str]) -> None:
    """Write *lines* to the seed file, one entry per line, creating the
    parent directory if needed."""
    os.makedirs(os.path.dirname(SEED_FILE), exist_ok=True)
    with open(SEED_FILE, "w", encoding="utf-8") as handle:
        handle.writelines(f"{entry}\n" for entry in lines)

def load_state() -> dict:
    """Load persisted run state; a missing or unreadable file yields {}."""
    if not os.path.exists(STATE_FILE):
        return {}
    try:
        with open(STATE_FILE, "r", encoding="utf-8") as handle:
            return json.load(handle)
    except Exception:
        # Corrupt state is not fatal; start over with an empty dict.
        return {}

def save_state(state: dict) -> None:
    """Persist run state as pretty-printed UTF-8 JSON."""
    os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
    payload = json.dumps(state, ensure_ascii=False, indent=2)
    with open(STATE_FILE, "w", encoding="utf-8") as handle:
        handle.write(payload)

def dedupe_keep_order(lines: list[str]) -> list[str]:
    """Drop case-insensitive duplicates, keeping first occurrences in order."""
    seen: set[str] = set()
    result: list[str] = []
    for entry in lines:
        folded = entry.casefold()
        if folded in seen:
            continue
        seen.add(folded)
        result.append(entry)
    return result

def main() -> None:
    """Refresh the seed list from every source and persist run state.

    Each source failure is logged and skipped; the run continues with
    whatever was collected from the remaining sources.
    """
    existing = load_existing()
    state = load_state()

    collected: list[str] = []
    for source in SOURCES:
        try:
            page = fetch(source["url"])
            found = extract_candidates(page)
            random.shuffle(found)
            # Cap each source's contribution; be polite between requests.
            collected.extend(found[:150])
            time.sleep(2)
        except Exception as exc:
            print(f"[WARN] {source['name']} okunamadı: {exc}")

    merged = dedupe_keep_order(existing + collected)

    # Re-filter everything (old seeds included) and cap at 5000 lines.
    merged = [entry for entry in merged if valid_song_line(entry)][:5000]

    save_seed(merged)

    state["last_run"] = int(time.time())
    state["total_seed_count"] = len(merged)
    save_state(state)

    print(f"Toplam seed sayısı: {len(merged)}")
    print(f"Yeni eklenen yaklaşık satır: {max(0, len(merged) - len(existing))}")

# Run the refresh only when executed as a script, not on import.
if __name__ == "__main__":
    main()
