#!/usr/bin/env python3
import os, re, sys, time, json, signal, asyncio, aiohttp
from urllib.parse import urljoin, urlparse

# Directory containing this script; every data path below is resolved against it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Working directory for runtime artifacts (state, log, PID file).
# NOTE: it is created as a side effect of importing/running this module.
DELETE_DIR = os.path.join(BASE_DIR, "delete")
os.makedirs(DELETE_DIR, exist_ok=True)

# Input: list of URLs, one per line (read by worker()).
INPUT_FILE = os.path.join(BASE_DIR, "urls.txt")
# Output: pipe-delimited result records, appended by worker().
OUTPUT_FILE = os.path.join(BASE_DIR, "vysledky.txt")
# JSON file of the form {"index": N} used to resume after a restart.
STATE_FILE = os.path.join(DELETE_DIR, "state.json")
# Plain-text log written by log().
LOG_FILE = os.path.join(DELETE_DIR, "log.txt")
# Holds the PID of the forked background process (written by start()).
PID_FILE = os.path.join(DELETE_DIR, "daemon.pid")

# Substrings that mark an href as a likely contact page; find_contacts()
# compares them against the lower-cased link.
CONTACT_KW = ["contact", "team", "about", "members"]

# Default request headers passed to the aiohttp session in worker().
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Loose e-mail matcher: local part, "@", domain, a dot and an alphabetic TLD.
EMAIL_REGEX = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]+")
# Loose phone matcher: optional 1-3 digit prefix (with optional "+"), then
# 2-4 groups of 2-4 digits (optionally parenthesised, separated by space,
# dot or hyphen), then a final run of 2-4 digits. Candidates are filtered
# further by clean_phones().
PHONE_REGEX = re.compile(r"(?:\+?\d{1,3}[\s.-]?)?(?:\(?\d{2,4}\)?[\s.-]?){2,4}\d{2,4}")

# ---------- LOG ----------
def log(msg):
    """Append one timestamped entry to LOG_FILE.

    The entry has the form ``YYYY-MM-DD HH:MM:SS | <msg>`` followed by a
    newline. The file is opened in append mode (UTF-8) on every call.
    """
    with open(LOG_FILE, "a", encoding="utf-8") as handle:
        stamp = time.strftime('%Y-%m-%d %H:%M:%S')
        entry = f"{stamp} | {msg}\n"
        handle.write(entry)

# ---------- STATE ----------
def save_state(i):
    """Persist the resume index ``i`` to STATE_FILE as ``{"index": i}``.

    The JSON is written to a temporary sibling file first and then moved
    over STATE_FILE with os.replace(). Opening STATE_FILE directly in "w"
    mode truncates it before the new content is written, so an interruption
    at that moment would leave an empty/partial file that cannot be parsed
    on the next resume.
    """
    tmp_path = STATE_FILE + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump({"index": i}, f)
    # Rename within the same directory: readers see either the old or the
    # new complete file, never a half-written one.
    os.replace(tmp_path, STATE_FILE)

def load_state():
    """Return the resume index stored in STATE_FILE, or 0 if unavailable.

    Returns 0 when the file does not exist, cannot be parsed as a JSON
    object, or holds an "index" that is not a non-negative integer. Invalid
    content is reported through log() instead of crashing the daemon at
    startup.
    """
    try:
        # Context manager so the handle is closed deterministically.
        with open(STATE_FILE, encoding="utf-8") as f:
            index = json.load(f).get("index", 0)
    except FileNotFoundError:
        return 0
    except (ValueError, AttributeError) as e:
        # ValueError covers JSONDecodeError / bad encoding; AttributeError
        # covers valid JSON whose top level is not an object.
        log(f"STATE_INVALID | {repr(e)}")
        return 0

    # bool is a subclass of int, so reject it explicitly; a negative value
    # would make the caller index the URL list from the end.
    if isinstance(index, bool) or not isinstance(index, int) or index < 0:
        log(f"STATE_INVALID | index={index!r}")
        return 0
    return index

# ---------- HTML CLEAN ----------
def clean_html(html):
    """Return ``html`` with script, style and svg elements blanked out.

    Each ``<tag ...>...</tag>`` element (case-insensitive, may span lines)
    is replaced by a single space, processing script, then style, then svg.
    """
    for tag in ("script", "style", "svg"):
        pattern = rf"<{tag}.*?>.*?</{tag}>"
        html = re.sub(pattern, " ", html, flags=re.S | re.I)
    return html

# ---------- META ----------
def extract_meta(html):
    """Extract the page title, meta keywords and meta description.

    Args:
        html: page markup as a string.

    Returns:
        A ``(title, keywords, description)`` tuple of strings; a missing
        item is returned as ``""``. Runs of whitespace (including newlines)
        inside a value are collapsed to a single space.
    """
    def _text(match, group):
        # Collapse whitespace so multi-line values become a single line.
        return " ".join(match.group(group).split()) if match else ""

    # Allow attributes on <title> and content that spans several lines.
    title = re.search(r"<title\b[^>]*>(.*?)</title>", html, re.I | re.S)
    # The closing quote must be the same character as the opening one (\1);
    # otherwise content="It's great" would be cut at the apostrophe.
    desc = re.search(
        r'name=["\']description["\'][^>]+content=(["\'])(.*?)\1', html, re.I | re.S
    )
    kw = re.search(
        r'name=["\']keywords["\'][^>]+content=(["\'])(.*?)\1', html, re.I | re.S
    )
    return (
        _text(title, 1),
        _text(kw, 2),
        _text(desc, 2),
    )

# ---------- CONTACT LINKS ----------
def find_contacts(html, base):
    """Return absolute URLs of links that look like contact pages.

    A link qualifies when its href contains one of CONTACT_KW
    (case-insensitive), resolves against ``base`` to an http(s) URL, and
    its path does not end with a static-asset extension.

    Args:
        html: page markup to scan for ``href`` attributes.
        base: URL of the page, used to resolve relative links.

    Returns:
        De-duplicated list of absolute URLs in order of first appearance,
        so that callers slicing the list get a deterministic selection.
    """
    asset_ext = (".css", ".js", ".png", ".jpg", ".svg", ".pdf")
    found = {}
    for href in re.findall(r'href=["\'](.*?)["\']', html, re.I):
        lowered = href.lower()
        if not any(k in lowered for k in CONTACT_KW):
            continue
        try:
            absolute = urljoin(base, href.strip())
            parts = urlparse(absolute)
        except ValueError:
            # Malformed URL (e.g. broken IPv6 literal) - not fetchable.
            continue
        # mailto:/tel:/javascript: links can contain a keyword
        # ("mailto:contact@...") but are not pages that can be fetched.
        if parts.scheme not in ("http", "https"):
            continue
        # Test the path only, so "styles.css?ver=5" is still recognised
        # as an asset despite the query string.
        if parts.path.lower().endswith(asset_ext):
            continue
        found.setdefault(absolute, None)
    return list(found)

# ---------- FALLBACK ----------
def fallback_urls(url):
    """Build guessed contact-page URLs on the same scheme and host as ``url``.

    Only the scheme and network location of ``url`` are kept; each fixed
    path is appended to that origin. Returns a list of six URLs.
    """
    parts = urlparse(url)
    origin = f"{parts.scheme}://{parts.netloc}"
    guesses = []
    for path in ("/kontakt", "/contact", "/contact-us", "/impressum", "/about", "/kontakty"):
        guesses.append(origin + path)
    return guesses

# ---------- EMAIL ----------
def clean_emails(emails):
    """Return the set of stripped addresses that do not look like image names.

    Entries whose lower-cased text ends with an image extension
    (e.g. ``logo@2x.png``) are dropped; surrounding whitespace is removed
    from the rest.
    """
    image_ext = (".jpg", ".png", ".jpeg", ".gif", ".svg")
    stripped = (item.strip() for item in emails)
    return {addr for addr in stripped if not addr.lower().endswith(image_ext)}

def extract_emails(html):
    """Collect e-mail addresses found in ``html``.

    Three sources are combined:
      1. direct matches of EMAIL_REGEX in the markup,
      2. targets of ``mailto:`` links,
      3. matches in a lower-cased copy where " at " / " dot " are replaced
         by "@" / "." (simple de-obfuscation).

    Returns:
        A set of addresses filtered through clean_emails().
    """
    from urllib.parse import unquote

    emails = set(EMAIL_REGEX.findall(html))

    for target in re.findall(r"mailto:([^\"'>]+)", html, re.I):
        # Drop "?subject=..." style parameters and decode %40 etc.; a
        # mailto target may also list several comma-separated recipients.
        recipients = unquote(target.split("?", 1)[0])
        for addr in recipients.split(","):
            addr = addr.strip()
            if "@" in addr:
                emails.add(addr)

    html2 = html.lower().replace(" at ", "@").replace(" dot ", ".")
    # html2 is lower-cased, so every address already found would show up
    # again in lower case; only add the ones that are genuinely new.
    known = {e.lower() for e in emails}
    for addr in EMAIL_REGEX.findall(html2):
        if addr not in known:
            known.add(addr)
            emails.add(addr)

    return clean_emails(emails)

# ---------- PHONE ----------
def clean_phones(raw):
    """Filter raw phone candidates and return at most five of them.

    A candidate is kept when, after stripping whitespace, it starts with
    "+" or "0" and contains between 9 and 15 digits. Duplicates are
    removed via a set, so the order of the returned list is unspecified.
    """
    stripped = (item.strip() for item in raw)
    accepted = {
        number
        for number in stripped
        # must start with "+" or "0"
        if number.startswith(("+", "0"))
        and 9 <= len(re.sub(r"\D", "", number)) <= 15
    }
    return list(accepted)[:5]  # at most 5

def extract_phones(html):
    """Return up to five phone numbers found in ``html``.

    All PHONE_REGEX matches are de-duplicated and then filtered by
    clean_phones().
    """
    candidates = set(PHONE_REGEX.findall(html))
    return clean_phones(candidates)

# ---------- FILTER ----------
def bad_domain(url):
    """Return True if ``url`` contains any blacklisted substring.

    The check is a plain, case-sensitive substring test against the whole
    URL string (not only the host part).
    """
    for marker in ("zugradar", "geoview", "wikipedia", "booking", "map"):
        if marker in url:
            return True
    return False

def is_aggregator(html):
    """Heuristically detect directory/aggregator pages.

    Returns True when at least two of the marker phrases occur in the
    lower-cased page text.
    """
    lowered = html.lower()
    markers = (
        "top fachbetriebe", "alle firmen", "verzeichnis",
        "angebote vergleichen", "bewertungen", "hotelübersicht",
    )
    hits = [phrase for phrase in markers if phrase in lowered]
    return len(hits) >= 2

# ---------- FETCH ----------
async def fetch(session,url):
    """Download ``url`` and return the response body as text.

    Best-effort: any failure is logged as FETCH_FAIL and reported to the
    caller as an empty string.

    Args:
        session: an aiohttp client session.
        url: absolute URL to request.

    Returns:
        The decoded body (undecodable bytes ignored), or ``""`` when the
        request fails, times out (20 s total) or the server answers with an
        HTTP error status (>= 400).
    """
    try:
        # Explicit ClientTimeout instead of a bare number, which aiohttp
        # only accepts for backward compatibility.
        timeout = aiohttp.ClientTimeout(total=20)
        async with session.get(url, timeout=timeout) as r:
            # Error pages (404 for a guessed fallback URL, 403 bot walls)
            # are not real content and must not be scraped for contacts.
            if r.status >= 400:
                log(f"FETCH_FAIL | {url} | HTTP {r.status}")
                return ""
            return await r.text(errors="ignore")
    except Exception as e:
        # Deliberately broad: one unreachable site must not stop the run.
        log(f"FETCH_FAIL | {url} | {repr(e)}")
        return ""

# ---------- PROCESS ----------
def _flat_field(text):
    """Make free text safe for one column of the pipe-delimited record."""
    # "|" is the column separator and a newline ends the record, so neither
    # may survive inside a field; whitespace runs are collapsed to a space.
    return " ".join(text.replace("|", "/").split())

async def _scrape_page(session, page_url):
    """Fetch ``page_url`` and return the ``(emails, phones)`` found on it."""
    page = await fetch(session, page_url)
    if not page:
        return set(), []
    page = clean_html(page)
    return extract_emails(page), extract_phones(page)

async def process(session,url):
    """Scrape one site and build its output record.

    The start page is fetched and scanned for meta data, e-mails, phone
    numbers and contact-page links. Up to three contact links and the
    guessed fallback URLs are then fetched as additional sources.

    Args:
        session: an aiohttp client session (passed through to fetch()).
        url: start URL of the site.

    Returns:
        A pipe-delimited string
        ``url|title|keywords|description|contacts|emails|phones`` (the last
        three comma-separated), or None when the URL is skipped (blacklisted,
        unreachable, or classified as an aggregator).
    """
    log(f"PROCESS_START | {url}")

    if bad_domain(url):
        log(f"SKIP_BAD_DOMAIN | {url}")
        return None

    html = await fetch(session,url)
    if not html:
        return None

    html = clean_html(html)

    if is_aggregator(html):
        log(f"SKIP_AGGREGATOR | {url}")
        return None

    title,kw,desc = extract_meta(html)
    emails = set(extract_emails(html))
    phones = list(extract_phones(html))

    contacts = find_contacts(html,url)

    # Pages already fetched, so fallback URLs are skipped only when they
    # were really visited (not merely listed beyond the first three links).
    visited = set()
    for c in contacts[:3]:
        visited.add(c)
        e2, p2 = await _scrape_page(session, c)
        emails |= e2
        phones += p2

    for f in fallback_urls(url):
        if f in visited:
            continue
        e2, p2 = await _scrape_page(session, f)
        if e2 or p2:
            log(f"FALLBACK_HIT | {f} | e:{len(e2)} p:{len(p2)}")
        emails |= e2
        phones += p2

    # De-duplicate while keeping discovery order, so the five numbers kept
    # are deterministic and the start page's numbers take priority.
    phones = list(dict.fromkeys(phones))[:5]

    log(f"PROCESS_DONE | {url} | emails:{len(emails)} phones:{len(phones)}")

    return "|".join([
        url,
        # Titles such as "Home | Company" would otherwise shift the columns.
        _flat_field(title),
        _flat_field(kw),
        _flat_field(desc),
        ",".join(contacts),
        ",".join(sorted(emails)),  # sorted for reproducible output
        ",".join(phones)
    ])

# ---------- WORKER ----------
async def worker():
    """Process the URLs from INPUT_FILE sequentially, resuming from saved state.

    For every URL the result of process() is buffered; the buffer is
    appended to OUTPUT_FILE roughly once a minute and once more when the
    worker ends for any reason. After each URL the index of the *next* URL
    to process is stored via save_state(), and the worker sleeps one second
    between URLs.
    """
    start_index = load_state()
    log(f"RESUME_FROM | {start_index}")

    # Context manager so the input handle is closed right after reading.
    with open(INPUT_FILE,encoding="utf-8") as src:
        urls = [x.strip() for x in src if x.strip()]

    buf = []
    last = time.time()

    def flush(tag):
        # Nothing buffered: writing would only append a blank line.
        if not buf:
            return
        with open(OUTPUT_FILE,"a",encoding="utf-8") as f:
            f.write("\n".join(buf)+"\n")
        log(f"{tag} | {len(buf)}")
        buf.clear()

    try:
        async with aiohttp.ClientSession(headers=HEADERS) as session:
            for idx in range(start_index,len(urls)):
                line = await process(session,urls[idx])
                if line:
                    buf.append(line)

                # Store the next index: URL idx is done, so a restart must
                # not process it (and emit its record) a second time.
                save_state(idx + 1)
                await asyncio.sleep(1)

                if time.time()-last > 60:
                    flush("SAVE_BATCH")
                    last=time.time()
    finally:
        # Also runs when the worker is stopped (SystemExit from the SIGTERM
        # handler / task cancellation), so buffered results whose state was
        # already advanced are not lost.
        flush("FINAL_SAVE")

# ---------- DAEMON ----------
def _pid_alive(pid):
    """Return True if a process with ``pid`` currently exists."""
    try:
        os.kill(pid, 0)  # signal 0: existence check only, nothing is sent
    except ProcessLookupError:
        return False
    except PermissionError:
        # The process exists but belongs to another user.
        return True
    return True

def start():
    """Fork a background process that runs worker().

    If PID_FILE points to a live process, "běží" is printed and nothing is
    started. A PID file left behind by a process that no longer exists (or
    holding unreadable content) is treated as stale and removed. The parent
    records the child's PID in PID_FILE, prints "start" and returns; the
    child installs a SIGTERM handler that exits cleanly and runs the worker.
    """
    if os.path.exists(PID_FILE):
        try:
            with open(PID_FILE) as f:
                old_pid = int(f.read().strip())
        except (OSError, ValueError):
            old_pid = None

        if old_pid is not None and old_pid > 0 and _pid_alive(old_pid):
            print("běží")
            return

        # Stale PID file: the previous run finished or crashed without
        # cleaning up; it must not block new starts forever.
        try:
            os.remove(PID_FILE)
        except FileNotFoundError:
            pass

    pid=os.fork()
    if pid>0:
        with open(PID_FILE,"w") as f:
            f.write(str(pid))
        print("start")
        return

    # Child: turn SIGTERM into SystemExit so cleanup code gets to run.
    signal.signal(signal.SIGTERM, lambda s,f: sys.exit(0))
    asyncio.run(worker())

def stop():
    """Send SIGTERM to the process recorded in PID_FILE and remove the file.

    Prints "neběží" when there is no PID file, or when the file is stale
    (the process no longer exists or the content is not a valid PID); the
    stale file is removed in that case. Prints "stop" after the signal was
    delivered.
    """
    if not os.path.exists(PID_FILE):
        print("neběží")
        return

    stopped = True
    try:
        with open(PID_FILE) as f:
            pid = int(f.read().strip())
        # PIDs <= 0 address process groups in os.kill(); never signal those.
        if pid <= 0:
            raise ValueError(f"invalid pid {pid}")
        os.kill(pid,signal.SIGTERM)
    except (ValueError, ProcessLookupError):
        # Nothing to stop; fall through so the stale file is still removed.
        stopped = False

    os.remove(PID_FILE)
    print("stop" if stopped else "neběží")

if __name__ == "__main__":
    # "--stop" anywhere on the command line stops the background process;
    # any other invocation starts it.
    action = stop if "--stop" in sys.argv else start
    action()