import requests
from bs4 import BeautifulSoup
from collections import deque
from urllib.parse import urldefrag, urljoin, urlparse

def crawl_website(start_url, max_pages=100):
    """Breadth-first crawl starting at start_url, staying on the same domain."""
    visited_urls = set()
    urls_to_visit = deque([start_url])  # FIFO queue of URLs waiting to be fetched
    base_domain = urlparse(start_url).netloc

    while urls_to_visit and len(visited_urls) < max_pages:
        url = urls_to_visit.popleft()
        if url in visited_urls:
            continue

        try:
            response = requests.get(url, timeout=10)  # timeout so a slow server can't stall the crawl
            if response.status_code == 200:
                visited_urls.add(url)
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a', href=True):
                    # Resolve relative links against the page they were found on,
                    # and drop any #fragment so the same page isn't queued twice
                    full_url, _ = urldefrag(urljoin(url, link['href']))
                    # Only follow URLs that stay on the same domain
                    if urlparse(full_url).netloc == base_domain and full_url not in visited_urls:
                        urls_to_visit.append(full_url)
        except requests.RequestException as e:
            print(f"Failed to fetch {url}: {e}")

    return list(visited_urls)

# Example usage
start_url = 'https://www.example.com'
all_urls = crawl_website(start_url)
print(f'Found {len(all_urls)} URLs.')