#!/usr/bin/env python3
"""Count how many pages a site has, via its sitemap.xml or a polite same-host crawl."""
import argparse
import logging
import sys
import time
from collections import deque
from urllib.parse import urljoin, urlsplit, urlunsplit
from urllib.robotparser import RobotFileParser
from xml.etree import ElementTree as ET

import requests
from bs4 import BeautifulSoup

DEFAULT_UA = "abaj-ai-crawler/0.1 (+https://abaj.ai)"


def normalize_url(url: str) -> str:
    """
    Normalize URL by:
    - removing fragments
    - stripping trailing slashes (except root)
    - removing query params
    """
    parts = urlsplit(url)
    path = parts.path or "/"
    if path != "/" and path.endswith("/"):
        path = path.rstrip("/")
    # drop query and fragment
    new_parts = (parts.scheme, parts.netloc, path, "", "")
    return urlunsplit(new_parts)


def get_robots(base_url: str, user_agent: str) -> RobotFileParser | None:
    """Fetch and parse robots.txt; return None if it cannot be read."""
    # Note: RobotFileParser.read() fetches with urllib's default User-Agent;
    # `user_agent` is only used later for can_fetch() checks.
    robots_url = urljoin(base_url, "/robots.txt")
    rp = RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
        return rp
    except Exception as e:
        logging.warning("Could not read robots.txt at %s: %s", robots_url, e)
        return None


def count_from_sitemap(base_url: str, user_agent: str, timeout: float = 10.0) -> int | None:
    """Try to count pages via sitemap.xml; returns None if not available."""
    sitemap_url = urljoin(base_url, "/sitemap.xml")
    try:
        resp = requests.get(sitemap_url, headers={"User-Agent": user_agent}, timeout=timeout)
    except Exception as e:
        logging.info("No sitemap (error fetching %s): %s", sitemap_url, e)
        return None
    if resp.status_code != 200:
        logging.info("No sitemap (status %s from %s)", resp.status_code, sitemap_url)
        return None
    content_type = resp.headers.get("Content-Type", "")
    if "xml" not in content_type and "text" not in content_type:
        logging.info("Sitemap at %s did not look like XML (Content-Type %r)", sitemap_url, content_type)
        return None
    try:
        tree = ET.fromstring(resp.content)
    except ET.ParseError as e:
        logging.info("Could not parse sitemap XML at %s: %s", sitemap_url, e)
        return None

    # The sitemap namespace may or may not be present, so match tags by suffix.
    loc_tags = []
    if tree.tag.endswith("sitemapindex"):
        # Sitemap index: count the child sitemaps it lists.
        for sm in tree.iter():
            if sm.tag.endswith("loc") and sm.text:
                loc_tags.append(sm.text.strip())
        logging.info("Sitemap index found with %d child sitemaps", len(loc_tags))
        # We won't fetch nested sitemaps here to keep it lightweight.
        # Instead, caller can choose to crawl if they want more detail.
        return None

    for loc in tree.iter():
        if loc.tag.endswith("loc") and loc.text:
            loc_tags.append(loc.text.strip())
    logging.info("Sitemap at %s lists %d URLs", sitemap_url, len(loc_tags))
    return len(loc_tags)


def crawl_site(
    base_url: str,
    delay: float = 1.0,
    max_pages: int = 500,
    user_agent: str = DEFAULT_UA,
) -> int:
    """Polite BFS crawl restricted to same host, counting unique HTML pages."""
    parsed_root = urlsplit(base_url)
    root_host = parsed_root.netloc
    root_scheme = parsed_root.scheme

    rp = get_robots(base_url, user_agent)

    seen: set[str] = set()
    queue: deque[str] = deque([normalize_url(base_url)])
    last_request_time = 0.0

    session = requests.Session()
    session.headers.update({"User-Agent": user_agent})

    html_pages = 0
    non_html = 0
    errors = 0

    while queue and len(seen) < max_pages:
        url = queue.popleft()
        if url in seen:
            continue
        seen.add(url)

        if rp and not rp.can_fetch(user_agent, url):
            logging.debug("Disallowed by robots.txt: %s", url)
            continue

        # politeness delay
        elapsed = time.time() - last_request_time
        if elapsed < delay:
            time.sleep(delay - elapsed)

        try:
            resp = session.get(url, timeout=10)
            last_request_time = time.time()
        except Exception as e:
            errors += 1
            logging.warning("Error fetching %s: %s", url, e)
            continue

        if resp.status_code != 200:
            logging.debug("Non-200 (%s) at %s", resp.status_code, url)
            continue

        content_type = resp.headers.get("Content-Type", "")
        if "text/html" not in content_type:
            non_html += 1
            continue
        html_pages += 1

        soup = BeautifulSoup(resp.text, "html.parser")
        for a in soup.find_all("a", href=True):
            href = a["href"].strip()
            # skip mailto:, tel:, javascript: etc.
            if any(href.lower().startswith(prefix) for prefix in ("mailto:", "tel:", "javascript:")):
                continue
            new_url = urljoin(url, href)
            parts = urlsplit(new_url)
            # stay on same host and scheme
            if parts.netloc != root_host or parts.scheme != root_scheme:
                continue
            norm = normalize_url(new_url)
            if norm not in seen:
                queue.append(norm)

    logging.info(
        "Crawl finished: %d HTML pages, %d non-HTML URLs, %d errors, %d URLs visited (max_pages=%d)",
        html_pages, non_html, errors, len(seen), max_pages,
    )
    return html_pages


def main():
    parser = argparse.ArgumentParser(
        description="Politely count how many pages are on a site (default: https://abaj.ai)."
    )
    parser.add_argument(
        "base_url",
        nargs="?",
        default="https://abaj.ai",
        help="Base URL to crawl (default: https://abaj.ai)",
    )
    parser.add_argument(
        "--delay",
        type=float,
        default=1.0,
        help="Delay between requests in seconds (default: 1.0)",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=300,
        help="Safety cap on number of pages to crawl (default: 300)",
    )
    parser.add_argument(
        "--no-sitemap",
        action="store_true",
        help="Skip sitemap.xml shortcut and do a full crawl",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="count",
        default=0,
        help="Increase verbosity (use -vv for debug)",
    )
    args = parser.parse_args()

    log_level = logging.WARNING
    if args.verbose == 1:
        log_level = logging.INFO
    elif args.verbose >= 2:
        log_level = logging.DEBUG
    logging.basicConfig(level=log_level, format="%(levelname)s: %(message)s")

    base_url = args.base_url.rstrip("/")
    ua = DEFAULT_UA

    if not args.no_sitemap:
        sitemap_count = count_from_sitemap(base_url, ua)
        if sitemap_count is not None:
            print(f"Sitemap reports {sitemap_count} URLs for {base_url}")
            sys.exit(0)
        else:
            logging.info("Falling back to crawler because sitemap was unavailable/incomplete.")

    page_count = crawl_site(base_url, delay=args.delay, max_pages=args.max_pages, user_agent=ua)
    print(f"Crawler visited {page_count} unique HTML pages on {base_url}")


if __name__ == "__main__":
    main()
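
# Example invocations, shown here as a sketch: the filename count_pages.py is an
# assumption (the script does not fix its own name); all flags match the argparse
# options defined above.
#
#   python count_pages.py
#       -> try the sitemap shortcut for https://abaj.ai, else fall back to crawling
#
#   python count_pages.py https://example.com --no-sitemap --delay 0.5 --max-pages 100 -vv
#       -> skip the sitemap and do a full polite crawl with debug logging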