(.*?)

#!/usr/bin/env python3
import os
import re
import sys
import time
import json
import signal
import asyncio
import aiohttp
from urllib.parse import urljoin, urlparse

# Directory that contains this script; every data file below is resolved
# relative to it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Working directory for runtime artifacts (state, log, PID file).
# NOTE: it is created as a side effect of importing/running this module.
DELETE_DIR = os.path.join(BASE_DIR, "delete")
os.makedirs(DELETE_DIR, exist_ok=True)

# Input: list of URLs to process, one per line (blank lines are ignored).
INPUT_FILE = os.path.join(BASE_DIR, "urls.txt")
# Output: one '|'-separated result line is appended per processed URL.
OUTPUT_FILE = os.path.join(BASE_DIR, "vysledky.txt")
# JSON file holding the resume position as {"index": <int>}.
STATE_FILE = os.path.join(DELETE_DIR, "state.json")
# Append-only, timestamped text log written by log().
LOG_FILE = os.path.join(DELETE_DIR, "log.txt")
# Holds the PID of the forked background worker while it is considered running.
PID_FILE = os.path.join(DELETE_DIR, "daemon.pid")

# ---------- SETTINGS ----------
# Keywords identifying contact pages. This single list drives both the link
# matching in find_contacts() and the guessed URLs built by fallback_urls().
CONTACT_KW = ["contact", "kontakt", "impressum"]   # the only keyword list, used for matching and for fallback

# Default request headers for the aiohttp session.
HEADERS = {
    "User-Agent": "Mozilla/5.0"
}

# Timeout (seconds) for the first request attempt in fetch().
REQUEST_TIMEOUT = 8
# Timeout (seconds) for the retry that fetch() makes with ssl=False.
REQUEST_TIMEOUT_SSL = 12
# When True, fetch() retries a failed request with ssl=False.
SSL_FALLBACK = True

# Substrings consulted by bad_domain() to skip unwanted sites.
BAD_DOMAIN_PARTS = ["zugradar", "geoview", "wikipedia", "booking", "map"]

# Lower-case phrases typical of directory/aggregator pages; is_aggregator()
# rejects a page when at least two of them occur.
AGGREGATOR_MARKERS = [
    "top fachbetriebe",
    "alle firmen",
    "verzeichnis",
    "angebote vergleichen",
    "bewertungen",
    "hotelübersicht"
]

# Lower-case phrases that mark bot-challenge or "not found" pages; a single
# hit makes is_error_page() reject the page.
ERROR_PAGE_MARKERS = [
    "just a moment",
    "attention required",
    "page not found",
    "seite nicht gefunden",
    "404 not found"
]

# An e-mail address containing any of these substrings (compared in lower
# case) is discarded by clean_emails().
BAD_EMAIL_PARTS = [
    "mysite.com",
    "example.com",
    "example.org",
    "kontakt_de??",
    "noreply",
    "no-reply"
]

# Regex hits ending in these suffixes are asset file names (e.g. "logo@2x.png"),
# not e-mail addresses.
SKIP_EMAIL_SUFFIXES = (".jpg", ".png", ".jpeg", ".gif", ".svg")
# Links ending in these suffixes are never treated as contact pages.
SKIP_LINK_SUFFIXES = (".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf", ".xml")

# Phone plausibility limits applied by clean_phones().
PHONE_MIN_DIGITS = 9          # fewest digits a candidate may contain
PHONE_MAX_DIGITS = 13         # most digits a candidate may contain
PHONE_MAX_COUNT = 5           # maximum number of phone numbers kept
PHONE_MAX_DASHES = 2          # candidates with more '-' characters are rejected
PHONE_MIN_UNIQUE_DIGITS = 3   # rejects candidates such as "0000000000"

# Pause (seconds) after each processed URL.
CRAWL_DELAY_SECONDS = 0
# Buffered result lines are written out once more than this many seconds
# have passed since the previous write.
SAVE_EVERY_SECONDS = 60
# Maximum number of discovered contact links fetched per site.
MAX_CONTACT_LINKS = 3

# Try the guessed fallback URLs only when no e-mail or phone has been found yet.
USE_FALLBACK_ONLY_IF_NO_CONTACTS_FOUND = True

# local-part@domain.tld; intentionally loose, results are filtered by clean_emails().
EMAIL_REGEX = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]+")
# Optional country prefix, then 2-4 digit groups (optionally parenthesised,
# separated by space, dot or dash), then a final 2-4 digit group.
# Only non-capturing groups are used, so findall() yields whole matches.
PHONE_REGEX = re.compile(r"(?:\+?\d{1,3}[\s.-]?)?(?:\(?\d{2,4}\)?[\s.-]?){2,4}\d{2,4}")

# ---------- LOG ----------
def log(msg):
    """Append *msg* to LOG_FILE as a single line prefixed with a local timestamp."""
    stamp = time.strftime('%Y-%m-%d %H:%M:%S')
    entry = f"{stamp} | {msg}\n"
    with open(LOG_FILE, "a", encoding="utf-8") as handle:
        handle.write(entry)

# ---------- STATE ----------
def save_state(i):
    """Persist the resume index *i* to STATE_FILE as ``{"index": i}``.

    The JSON is written to a temporary sibling file and then moved over
    STATE_FILE with ``os.replace``. Opening STATE_FILE directly in "w" mode
    truncates it first, so a process killed between the truncation and the
    end of the dump (e.g. by the SIGTERM sent from ``stop``) would leave an
    empty, unparseable state file behind.
    """
    tmp_path = STATE_FILE + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump({"index": i}, f)
    # Atomic rename: readers see either the old or the new complete file.
    os.replace(tmp_path, STATE_FILE)

def load_state():
    """Return the resume index stored in STATE_FILE, or 0 when unavailable.

    Returns:
        The non-negative integer stored under ``"index"``. 0 is returned when
        the file does not exist, cannot be parsed as JSON (for example an
        empty file left by an interrupted write), or holds anything other
        than a non-negative integer index.
    """
    try:
        with open(STATE_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # First run: nothing has been processed yet.
        return 0
    except ValueError as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        # Restarting from the beginning is preferable to a daemon that can
        # never start again; the event is logged so it does not go unnoticed.
        log(f"STATE_UNREADABLE | {repr(exc)}")
        return 0

    index = data.get("index", 0) if isinstance(data, dict) else 0
    # bool is a subclass of int, so it has to be excluded explicitly.
    if isinstance(index, bool) or not isinstance(index, int) or index < 0:
        return 0
    return index

# ---------- HTML CLEAN ----------
def clean_html(html):
    """Return *html* with every <script>, <style> and <svg> element replaced by a space.

    Matching is case-insensitive and spans newlines. The elements are removed
    in the order script, style, svg.
    """
    stripped = html
    for pattern in (
        r"<script.*?>.*?</script>",
        r"<style.*?>.*?</style>",
        r"<svg.*?>.*?</svg>",
    ):
        stripped = re.sub(pattern, " ", stripped, flags=re.DOTALL | re.IGNORECASE)
    return stripped

# ---------- META ----------
def extract_meta(html):
    """Extract the page title, meta keywords and meta description from *html*.

    Compared with a plain ``<title>(.*?)</title>`` / ``content=["'](.*?)["']``
    search this handles three situations found in real pages:

    * ``<title>`` tags that carry attributes (``<title lang="de">``);
    * attribute values containing the *other* quote character
      (``content="It's open"``) - the closing quote must equal the opening one;
    * values spread over several lines - every whitespace run is collapsed to
      a single space so the result always fits on one line.

    Args:
        html: Page markup as a string.

    Returns:
        A ``(title, keywords, description)`` tuple of strings; a missing
        element is returned as ``""``.
    """
    def first_match(pattern, group):
        # Search case-insensitively across newlines and normalise whitespace.
        match = re.search(pattern, html, re.I | re.S)
        if not match:
            return ""
        return " ".join(match.group(group).split())

    title = first_match(r"<title\b[^>]*>(.*?)</title>", 1)
    # Group 3 is the content value; groups 1 and 2 remember the quote style
    # of the name and content attributes respectively.
    desc = first_match(
        r'name=(["\'])description\1[^>]+content=(["\'])(.*?)\2', 3
    )
    kw = first_match(
        r'name=(["\'])keywords\1[^>]+content=(["\'])(.*?)\2', 3
    )
    return (title, kw, desc)

# ---------- CONTACT LINKS ----------
def find_contacts(html, base):
    """Collect absolute URLs of links in *html* that look like contact pages.

    A link qualifies when its href (lower-cased, stripped) contains one of
    CONTACT_KW, is not a mailto:/tel:/javascript:/fragment link and does not
    end with one of SKIP_LINK_SUFFIXES. Hrefs are resolved against *base*;
    duplicates are dropped while the order of first appearance is kept.
    """
    ordered = {}
    for href in re.findall(r'href=["\'](.*?)["\']', html, re.I):
        lowered = href.lower().strip()
        if not lowered or lowered.startswith(("mailto:", "tel:", "javascript:", "#")):
            continue
        if lowered.endswith(SKIP_LINK_SUFFIXES):
            continue
        if not any(keyword in lowered for keyword in CONTACT_KW):
            continue
        # A dict preserves insertion order and ignores repeated keys.
        ordered.setdefault(urljoin(base, href), None)
    return list(ordered)

# ---------- FALLBACK ----------
def fallback_urls(url):
    """Build one guessed contact URL per CONTACT_KW entry at the root of *url*'s site."""
    parts = urlparse(url)
    root = parts.scheme + "://" + parts.netloc
    candidates = []
    for keyword in CONTACT_KW:
        candidates.append(f"{root}/{keyword.lstrip('/')}")
    return candidates

# ---------- EMAIL ----------
def clean_emails(emails):
    """Filter raw e-mail candidates and return the accepted ones as a set.

    Each candidate is stripped and cut at the first "?" (which removes mailto
    query strings). It is dropped when nothing is left, when it ends with one
    of SKIP_EMAIL_SUFFIXES, or when it contains one of BAD_EMAIL_PARTS; both
    checks are done on the lower-cased value. Accepted addresses keep their
    original letter case.
    """
    kept = set()
    for raw in emails:
        address = raw.strip().partition("?")[0].strip()
        if not address:
            continue

        lowered = address.lower()
        looks_like_asset = lowered.endswith(SKIP_EMAIL_SUFFIXES)
        blacklisted = any(part in lowered for part in BAD_EMAIL_PARTS)
        if not (looks_like_asset or blacklisted):
            kept.add(address)
    return kept

def extract_emails(html):
    """Return the set of plausible e-mail addresses found in *html*.

    Three sources are combined and then filtered through ``clean_emails``:

    1. direct EMAIL_REGEX matches;
    2. the targets of ``mailto:`` links;
    3. EMAIL_REGEX matches in a lower-cased copy of the page in which
       " at " / " dot " have been replaced by "@" / "." (simple
       de-obfuscation).

    Because step 3 works on lower-cased text it also re-discovers every
    address from step 1 in lower case. Those are skipped, otherwise an
    address written as "Info@Example.de" would be reported twice.
    """
    emails = set(EMAIL_REGEX.findall(html))
    emails.update(re.findall(r"mailto:([^\"'>\s]+)", html, re.I))

    deobfuscated = html.lower().replace(" at ", "@").replace(" dot ", ".")
    already_known = {address.lower() for address in emails}
    emails.update(
        address
        for address in EMAIL_REGEX.findall(deobfuscated)
        if address not in already_known
    )

    return clean_emails(emails)

# ---------- PHONE ----------
def clean_phones(raw):
    """Filter raw phone candidates and return at most PHONE_MAX_COUNT of them.

    A candidate is kept only when it
      * contains no "." (dotted matches are rejected outright),
      * starts with "+" or "0",
      * has at most PHONE_MAX_DASHES dashes,
      * has at least PHONE_MIN_UNIQUE_DIGITS distinct digits, and
      * has between PHONE_MIN_DIGITS and PHONE_MAX_DIGITS digits.

    Candidates are visited in sorted order, so the same input always yields
    the same output (truncating an unordered set does not). The same number
    written with different separators is reported once, which keeps
    formatting variants from using up the PHONE_MAX_COUNT slots.

    Args:
        raw: Iterable of candidate strings.

    Returns:
        List of accepted candidates (stripped, otherwise as written).
    """
    kept = {}
    for candidate in sorted(raw):
        p = candidate.strip()
        if not p:
            continue

        if "." in p:
            continue

        if not p.startswith(("+", "0")):
            continue

        if p.count("-") > PHONE_MAX_DASHES:
            continue

        digits = re.sub(r"\D", "", p)

        if len(set(digits)) < PHONE_MIN_UNIQUE_DIGITS:
            continue

        if not (PHONE_MIN_DIGITS <= len(digits) <= PHONE_MAX_DIGITS):
            continue

        # Identity of a number = its digits plus whether it is in "+" form.
        kept.setdefault((p.startswith("+"), digits), p)

    return list(kept.values())[:PHONE_MAX_COUNT]

def extract_phones(html):
    """Find all PHONE_REGEX matches in *html* and return those accepted by clean_phones."""
    # PHONE_REGEX has no capturing groups, so group(0) is what findall() yields.
    candidates = {match.group(0) for match in PHONE_REGEX.finditer(html)}
    return clean_phones(candidates)

# ---------- FILTER ----------
def bad_domain(url):
    """Return True when the host of *url* contains one of BAD_DOMAIN_PARTS.

    Only the host name is inspected. Testing the whole URL would reject
    unrelated sites because of their path or query (for example any URL
    containing "/sitemap" matches the "map" marker). The comparison is
    case-insensitive. When no host can be determined (no scheme, or a
    malformed URL) the whole string is checked instead.
    """
    try:
        host = urlparse(url).hostname
    except ValueError:
        # urlparse rejects e.g. malformed IPv6 literals.
        host = None
    haystack = (host or url).lower()
    return any(part in haystack for part in BAD_DOMAIN_PARTS)

def is_aggregator(html):
    """Return True when at least two AGGREGATOR_MARKERS occur in the lower-cased *html*."""
    lowered = html.lower()
    hits = 0
    for marker in AGGREGATOR_MARKERS:
        if marker in lowered:
            hits += 1
    return hits >= 2

def is_error_page(html):
    """Return True when the lower-cased *html* contains any of ERROR_PAGE_MARKERS."""
    lowered = html.lower()
    for marker in ERROR_PAGE_MARKERS:
        if marker in lowered:
            return True
    return False

# ---------- FETCH ----------
async def fetch(session, url):
    """Download *url* and return the response body as text ("" on failure).

    The first attempt uses REQUEST_TIMEOUT. If it fails for any reason and
    SSL_FALLBACK is enabled, one retry is made with REQUEST_TIMEOUT_SSL and
    ``ssl=False``.

    NOTE(security): ``ssl=False`` turns off TLS certificate verification for
    the retry. That is tolerable for harvesting public pages, but the content
    obtained this way is unauthenticated.

    Args:
        session: An ``aiohttp.ClientSession``.
        url: Absolute URL to download.

    Returns:
        The decoded body (undecodable bytes are dropped), or "" when every
        attempt failed. Failures are logged, never raised.
    """
    # aiohttp documents ClientTimeout as the timeout type; a bare number is
    # only accepted for backward compatibility.
    try:
        first_timeout = aiohttp.ClientTimeout(total=REQUEST_TIMEOUT)
        async with session.get(url, timeout=first_timeout) as r:
            return await r.text(errors="ignore")
    except Exception as e:
        # Deliberately broad: connection, timeout, TLS and decoding errors are
        # all treated the same way - log and fall through to the retry.
        log(f"FETCH_FAIL | {url} | {repr(e)}")

    if not SSL_FALLBACK:
        return ""

    try:
        retry_timeout = aiohttp.ClientTimeout(total=REQUEST_TIMEOUT_SSL)
        async with session.get(url, timeout=retry_timeout, ssl=False) as r:
            body = await r.text(errors="ignore")
            # Logged only after the body has really been read, so a failed
            # read is not reported as both OK and FAIL.
            log(f"SSL_FALLBACK_OK | {url}")
            return body
    except Exception as e2:
        log(f"SSL_FALLBACK_FAIL | {url} | {repr(e2)}")

    return ""

# ---------- PROCESS ----------
def _flat_field(text):
    """Make *text* safe for one '|'-separated output line (no '|', no newlines)."""
    return " ".join(text.replace("|", " ").split())


async def _harvest_page(session, page_url, skip_tag):
    """Fetch a secondary page and return the ``(emails, phones)`` found on it.

    Returns an empty set and an empty list when the page cannot be fetched or
    looks like an error page; the latter case is logged under *skip_tag*.
    """
    page = await fetch(session, page_url)
    if not page:
        return set(), []

    page = clean_html(page)
    if is_error_page(page):
        log(f"{skip_tag} | {page_url}")
        return set(), []

    return extract_emails(page), extract_phones(page)


async def process(session, url):
    """Scrape one site and return its result as a '|'-separated line.

    Steps:
      1. Skip the URL if bad_domain() rejects it, the homepage cannot be
         fetched, or the homepage looks like an error or aggregator page.
      2. Extract title/keywords/description, e-mails and phones from the
         homepage.
      3. Follow up to MAX_CONTACT_LINKS contact links found on the homepage.
      4. Probe the guessed fallback URLs - by default only when nothing has
         been found so far (USE_FALLBACK_ONLY_IF_NO_CONTACTS_FOUND).

    Args:
        session: An ``aiohttp.ClientSession`` used for all requests.
        url: Homepage URL of the site.

    Returns:
        ``url|title|keywords|description|contact links|emails|phones`` with
        the last three fields comma-separated, or None when the site was
        skipped. E-mails and phones are sorted so repeated runs produce the
        same line; free-text fields are flattened so a '|' or newline inside
        a title cannot break the row.
    """
    log(f"PROCESS_START | {url}")

    if bad_domain(url):
        log(f"SKIP_BAD_DOMAIN | {url}")
        return None

    html = await fetch(session, url)
    if not html:
        return None

    html = clean_html(html)

    if is_error_page(html):
        log(f"SKIP_ERROR_PAGE | {url}")
        return None

    if is_aggregator(html):
        log(f"SKIP_AGGREGATOR | {url}")
        return None

    title, kw, desc = extract_meta(html)
    emails = set(extract_emails(html))
    phones = list(extract_phones(html))

    contacts = find_contacts(html, url)

    # Only these links are actually requested; the rest of `contacts` is
    # reported in the output but never fetched.
    visited = contacts[:MAX_CONTACT_LINKS]
    for c in visited:
        found_emails, found_phones = await _harvest_page(session, c, "SKIP_ERROR_CONTACT")
        emails |= found_emails
        phones += found_phones

    use_fallback = not (USE_FALLBACK_ONLY_IF_NO_CONTACTS_FOUND and (emails or phones))

    if use_fallback:
        for f in fallback_urls(url):
            # Skip only pages that were really fetched above. A link that was
            # discovered but cut off by MAX_CONTACT_LINKS still deserves a try.
            if f in visited:
                continue

            e2, p2 = await _harvest_page(session, f, "SKIP_ERROR_FALLBACK")
            if e2 or p2:
                log(f"FALLBACK_HIT | {f} | e:{len(e2)} p:{len(p2)}")
            emails |= e2
            phones += p2

    # Sorting before truncating makes the selection deterministic.
    phones = sorted(set(phones))[:PHONE_MAX_COUNT]

    log(f"PROCESS_DONE | {url} | emails:{len(emails)} phones:{len(phones)}")

    return "|".join([
        url,
        _flat_field(title),
        _flat_field(kw),
        _flat_field(desc),
        ",".join(contacts),
        ",".join(sorted(emails)),
        ",".join(phones)
    ])

# ---------- WORKER ----------
async def worker():
    """Process every URL of INPUT_FILE from the saved position onwards.

    Result lines are buffered and appended to OUTPUT_FILE roughly every
    SAVE_EVERY_SECONDS. The resume index is persisted together with each
    flush and always names the *next* URL to process, so

    * the state on disk never runs ahead of the results on disk, and
    * a restart does not process the last finished URL a second time.

    The final flush sits in a ``finally`` block: when the daemon is stopped
    (SIGTERM raises SystemExit, which cancels this coroutine) or an
    unexpected error occurs, the buffered results are still written instead
    of being lost.
    """
    start_index = load_state()
    log(f"RESUME_FROM | {start_index}")

    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        urls = [x.strip() for x in f if x.strip()]

    buf = []
    next_index = start_index
    last = time.time()

    def flush(tag):
        # Results first, state second: a crash in between can only cause a
        # batch to be repeated, never to be skipped.
        if buf:
            with open(OUTPUT_FILE, "a", encoding="utf-8") as out:
                out.write("\n".join(buf) + "\n")
        log(f"{tag} | {len(buf)}")
        buf.clear()
        save_state(next_index)

    try:
        async with aiohttp.ClientSession(headers=HEADERS) as session:
            for idx in range(start_index, len(urls)):
                line = await process(session, urls[idx])
                if line:
                    buf.append(line)

                # This URL is done; a resume continues with the one after it.
                next_index = idx + 1
                await asyncio.sleep(CRAWL_DELAY_SECONDS)

                if time.time() - last > SAVE_EVERY_SECONDS:
                    flush("SAVE_BATCH")
                    last = time.time()
    finally:
        flush("FINAL_SAVE")

# ---------- DAEMON ----------
def start():
    """Fork a background worker unless one is already running.

    The parent records the child's PID in PID_FILE, prints "start" and
    returns. The child installs a SIGTERM handler that exits cleanly and
    runs ``worker()``.

    A PID file left behind by a worker that has finished or died no longer
    blocks a new start: the recorded PID is probed with signal 0 and the file
    is discarded when no such process exists. When the worker finishes
    normally it also removes its own PID file.
    """
    if os.path.exists(PID_FILE):
        try:
            with open(PID_FILE, "r", encoding="utf-8") as f:
                old_pid = int(f.read().strip())
            if old_pid <= 0:
                raise ValueError(old_pid)
            # Signal 0 performs the existence/permission check only.
            os.kill(old_pid, 0)
        except (ValueError, ProcessLookupError):
            # Unreadable content or no such process: the file is stale.
            try:
                os.remove(PID_FILE)
            except FileNotFoundError:
                pass
        except PermissionError:
            # The process exists but belongs to another user.
            print("běží")
            return
        else:
            print("běží")
            return

    pid = os.fork()
    if pid > 0:
        with open(PID_FILE, "w", encoding="utf-8") as f:
            f.write(str(pid))
        print("start")
        return

    signal.signal(signal.SIGTERM, lambda s, f: sys.exit(0))
    asyncio.run(worker())

    # Reached only when the worker ran to completion (SIGTERM leaves via
    # SystemExit, and stop() removes the file in that case). Remove the file
    # only if it still refers to this process.
    try:
        with open(PID_FILE, "r", encoding="utf-8") as f:
            recorded = f.read().strip()
        if recorded == str(os.getpid()):
            os.remove(PID_FILE)
    except FileNotFoundError:
        pass

def stop():
    """Send SIGTERM to the recorded worker and remove PID_FILE.

    Prints "stop" when a signal was delivered and "neběží" when there is no
    PID file or the recorded process no longer exists. In the second case
    the stale PID file is still removed, so a later start is not blocked.
    A PID file with unusable content (not a positive integer) is treated as
    stale; in particular a value <= 0 is never passed to ``os.kill``, where
    it would address whole process groups.
    """
    if not os.path.exists(PID_FILE):
        print("neběží")
        return

    try:
        with open(PID_FILE, "r", encoding="utf-8") as f:
            pid = int(f.read().strip())
    except ValueError:
        pid = 0

    signalled = False
    if pid > 0:
        try:
            os.kill(pid, signal.SIGTERM)
            signalled = True
        except ProcessLookupError:
            # The worker already exited; only the stale file is left.
            pass

    try:
        os.remove(PID_FILE)
    except FileNotFoundError:
        pass

    print("stop" if signalled else "neběží")

if __name__ == "__main__":
    # "--stop" anywhere on the command line stops the daemon; any other
    # invocation starts it.
    action = stop if "--stop" in sys.argv else start
    action()