#!/usr/bin/env python3
"""Count how many pages a site has, via its sitemap.xml or a polite same-host crawl."""
import argparse
import logging
import sys
import time
from collections import deque
from urllib.parse import urljoin, urlsplit, urlunsplit
from urllib.robotparser import RobotFileParser
from xml.etree import ElementTree as ET

import requests
from bs4 import BeautifulSoup

DEFAULT_UA = "abaj-ai-crawler/0.1 (+https://abaj.ai)"


def normalize_url(url: str) -> str:
    """
    Normalize URL by:
    - removing fragments
    - stripping trailing slashes (except root)
    - removing query params
    """
    parts = urlsplit(url)
    path = parts.path or "/"
    if path != "/" and path.endswith("/"):
        path = path.rstrip("/")
    # drop query and fragment
    new_parts = (parts.scheme, parts.netloc, path, "", "")
    return urlunsplit(new_parts)


def get_robots(base_url: str, user_agent: str) -> RobotFileParser | None:
    """Fetch and parse robots.txt; return None if it cannot be read."""
    # Note: RobotFileParser.read() fetches with urllib's default User-Agent;
    # `user_agent` is only used later for can_fetch() checks.
    robots_url = urljoin(base_url, "/robots.txt")
    rp = RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
        return rp
    except Exception as e:
        logging.warning("Could not read robots.txt at %s: %s", robots_url, e)
        return None


def count_from_sitemap(base_url: str, user_agent: str, timeout: float = 10.0) -> int | None:
    """Try to count pages via sitemap.xml; returns None if not available."""
    sitemap_url = urljoin(base_url, "/sitemap.xml")
    try:
        resp = requests.get(sitemap_url, headers={"User-Agent": user_agent}, timeout=timeout)
    except Exception as e:
        logging.info("No sitemap (error fetching %s): %s", sitemap_url, e)
        return None
    if resp.status_code != 200:
        logging.info("No sitemap (status %s from %s)", resp.status_code, sitemap_url)
        return None
    content_type = resp.headers.get("Content-Type", "")
    if "xml" not in content_type and "text" not in content_type:
        logging.info("Sitemap at %s did not look like XML (Content-Type %r)", sitemap_url, content_type)
        return None
    try:
        tree = ET.fromstring(resp.content)
    except ET.ParseError as e:
        logging.info("Could not parse sitemap XML at %s: %s", sitemap_url, e)
        return None

    # The sitemap namespace may or may not be present, so match tags by suffix.
    loc_tags = []
    if tree.tag.endswith("sitemapindex"):
        # Sitemap index: count the child sitemaps it lists.
        for sm in tree.iter():
            if sm.tag.endswith("loc") and sm.text:
                loc_tags.append(sm.text.strip())
        logging.info("Sitemap index found with %d child sitemaps", len(loc_tags))
        # We won't fetch nested sitemaps here to keep it lightweight.
        # Instead, caller can choose to crawl if they want more detail.
        return None

    for loc in tree.iter():
        if loc.tag.endswith("loc") and loc.text:
            loc_tags.append(loc.text.strip())
    logging.info("Sitemap at %s lists %d URLs", sitemap_url, len(loc_tags))
    return len(loc_tags)


def crawl_site(
    base_url: str,
    delay: float = 1.0,
    max_pages: int = 500,
    user_agent: str = DEFAULT_UA,
) -> int:
    """Polite BFS crawl restricted to same host, counting unique HTML pages."""
    parsed_root = urlsplit(base_url)
    root_host = parsed_root.netloc
    root_scheme = parsed_root.scheme

    rp = get_robots(base_url, user_agent)

    seen: set[str] = set()
    queue: deque[str] = deque([normalize_url(base_url)])
    last_request_time = 0.0

    session = requests.Session()
    session.headers.update({"User-Agent": user_agent})

    html_pages = 0
    non_html = 0
    errors = 0

    while queue and len(seen) < max_pages:
        url = queue.popleft()
        if url in seen:
            continue
        seen.add(url)

        if rp and not rp.can_fetch(user_agent, url):
            logging.debug("Disallowed by robots.txt: %s", url)
            continue

        # politeness delay
        elapsed = time.time() - last_request_time
        if elapsed < delay:
            time.sleep(delay - elapsed)

        try:
            resp = session.get(url, timeout=10)
            last_request_time = time.time()
        except Exception as e:
            errors += 1
            logging.warning("Error fetching %s: %s", url, e)
            continue

        if resp.status_code != 200:
            logging.debug("Non-200 (%s) at %s", resp.status_code, url)
            continue

        content_type = resp.headers.get("Content-Type", "")
        if "text/html" not in content_type:
            non_html += 1
            continue
        html_pages += 1

        soup = BeautifulSoup(resp.text, "html.parser")
        for a in soup.find_all("a", href=True):
            href = a["href"].strip()
            # skip mailto:, tel:, javascript: etc.
            if any(href.lower().startswith(prefix) for prefix in ("mailto:", "tel:", "javascript:")):
                continue
            new_url = urljoin(url, href)
            parts = urlsplit(new_url)
            # stay on same host and scheme
            if parts.netloc != root_host or parts.scheme != root_scheme:
                continue
            norm = normalize_url(new_url)
            if norm not in seen:
                queue.append(norm)

    logging.info(
        "Crawl finished: %d HTML pages, %d non-HTML URLs, %d errors, %d URLs visited (max_pages=%d)",
        html_pages, non_html, errors, len(seen), max_pages,
    )
    return html_pages


def main():
    parser = argparse.ArgumentParser(
        description="Politely count how many pages are on a site (default: https://abaj.ai)."
    )
    parser.add_argument(
        "base_url",
        nargs="?",
        default="https://abaj.ai",
        help="Base URL to crawl (default: https://abaj.ai)",
    )
    parser.add_argument(
        "--delay",
        type=float,
        default=1.0,
        help="Delay between requests in seconds (default: 1.0)",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=300,
        help="Safety cap on number of pages to crawl (default: 300)",
    )
    parser.add_argument(
        "--no-sitemap",
        action="store_true",
        help="Skip sitemap.xml shortcut and do a full crawl",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="count",
        default=0,
        help="Increase verbosity (use -vv for debug)",
    )
    args = parser.parse_args()

    log_level = logging.WARNING
    if args.verbose == 1:
        log_level = logging.INFO
    elif args.verbose >= 2:
        log_level = logging.DEBUG
    logging.basicConfig(level=log_level, format="%(levelname)s: %(message)s")

    base_url = args.base_url.rstrip("/")
    ua = DEFAULT_UA

    if not args.no_sitemap:
        sitemap_count = count_from_sitemap(base_url, ua)
        if sitemap_count is not None:
            print(f"Sitemap reports {sitemap_count} URLs for {base_url}")
            sys.exit(0)
        else:
            logging.info("Falling back to crawler because sitemap was unavailable/incomplete.")

    page_count = crawl_site(base_url, delay=args.delay, max_pages=args.max_pages, user_agent=ua)
    print(f"Crawler visited {page_count} unique HTML pages on {base_url}")


if __name__ == "__main__":
    main()
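
# Example invocations, shown here as a sketch: the filename count_pages.py is an
# assumption (the script does not fix its own name); all flags match the argparse
# options defined above.
#
#   python count_pages.py
#       -> try the sitemap shortcut for https://abaj.ai, else fall back to crawling
#
#   python count_pages.py https://example.com --no-sitemap --delay 0.5 --max-pages 100 -vv
#       -> skip the sitemap and do a full polite crawl with debug logging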