Multi-Page Crawling
Scrape entire websites with automatic sitemap discovery, depth-based link following, and URL pattern filtering.
Prerequisites
Common Use Cases
Documentation Sites
Index entire documentation portals for search, RAG pipelines, or offline reference.
Product Catalogs
Crawl e-commerce category pages to discover and scrape every product listing.
Blog Archives
Extract all articles from a blog for content analysis, migration, or archival.
Knowledge Bases
Scrape help centers and wikis to build training data for AI models or chatbots.
Start a Crawl
The crawl endpoint discovers pages via sitemap.xml and enqueues them for scraping. If no sitemap is found, it falls back to link extraction from the start URL.
import requests
import time
API_KEY = "YOUR_API_KEY"
BASE_URL = "https://api.alterlab.io/api/v1"
def start_crawl(url, max_pages=50, max_depth=3):
    """Start a website crawl with sitemap discovery.

    Args:
        url: Start URL; sitemap discovery begins at its domain root.
        max_pages: Upper bound on pages enqueued for scraping.
        max_depth: Maximum link-follow depth from the start URL.

    Returns:
        The crawl_id string on success (HTTP 202), or None on failure.
    """
    response = requests.post(
        f"{BASE_URL}/crawl",
        headers={"X-API-Key": API_KEY},
        json={
            "url": url,
            "max_pages": max_pages,
            "max_depth": max_depth,
            "respect_robots": True,
        },
        timeout=30,  # requests has no default timeout; avoid hanging forever
    )
    # Check the status code BEFORE parsing: error responses (e.g. a 500
    # from a proxy) may not be valid JSON and would raise in .json().
    if response.status_code == 202:
        data = response.json()
        print(f"Crawl started: {data['crawl_id']}")
        print(f"Pages discovered: {data['estimated_pages']}")
        print(f"Credits reserved: {data['estimated_credits']}")
        return data["crawl_id"]
    try:
        detail = response.json()
    except ValueError:
        detail = response.text
    print(f"Error: {detail}")
    return None
# Crawl a documentation site
crawl_id = start_crawl("https://docs.example.com", max_pages=100)

How Discovery Works
AlterLab tries these discovery methods in order: (1) sitemap.xml at the domain root, (2) sitemap_index.xml for sites with multiple sitemaps, (3) link extraction from the start URL as a fallback. Pages are then enqueued for scraping with automatic deduplication.
Depth & URL Filters
Control exactly which pages get scraped using depth limits and glob patterns. This is especially useful for large sites where you only need specific sections.
def crawl_with_filters(url):
    """Crawl with depth limits and URL pattern filters.

    Args:
        url: Start URL for the crawl.

    Returns:
        The parsed JSON response from the crawl endpoint.
    """
    response = requests.post(
        f"{BASE_URL}/crawl",
        headers={"X-API-Key": API_KEY},
        json={
            "url": url,
            "max_pages": 200,
            "max_depth": 2,
            # Only scrape blog posts and docs
            "include_patterns": [
                "/blog/*",
                "/docs/*",
                "/guides/*",
            ],
            # Skip admin pages, assets, and feeds
            "exclude_patterns": [
                "/admin/*",
                "/assets/*",
                "*.xml",
                "*.json",
                "/tag/*",
                "/author/*",
            ],
            # Stay on the start domain; set True to also crawl
            # subdomains such as blog.example.com
            "include_subdomains": False,
            # Output as markdown for LLM consumption
            "formats": ["markdown", "text"],
        },
        timeout=30,  # requests has no default timeout
    )
    return response.json()
# Crawl only the blog section of a site
result = crawl_with_filters("https://example.com")
print(f"Crawl ID: {result.get('crawl_id')}")
print(f"Pages to scrape: {result.get('estimated_pages')}")

Depth Explained
max_depth: 0 scrapes only the start URL. max_depth: 1 scrapes the start URL plus any pages linked from it. max_depth: 3 (default) follows links up to 3 hops from the start URL. Higher depth values discover more pages but use more credits.

Poll Crawl Progress
Crawls run asynchronously. Poll the status endpoint to track progress and detect completion.
def wait_for_crawl(crawl_id, poll_interval=5):
    """Poll crawl status until it reaches a terminal state.

    Args:
        crawl_id: ID returned when the crawl was started.
        poll_interval: Seconds to sleep between status checks.

    Returns:
        The final status payload (dict) once the crawl reports
        "completed", "partial", or "failed".
    """
    while True:
        response = requests.get(
            f"{BASE_URL}/crawl/{crawl_id}",
            headers={"X-API-Key": API_KEY},
            # Per-request timeout so one hung request can't stall
            # the polling loop forever.
            timeout=30,
        )
        status = response.json()
        completed = status["completed"]
        total = status["total"]
        failed = status["failed"]
        crawl_status = status["status"]
        depth = status.get("current_depth", 0)
        max_depth = status.get("max_depth", 0)
        print(
            f"[{crawl_status}] "
            f"{completed}/{total} pages done, "
            f"{failed} failed, "
            f"depth {depth}/{max_depth}"
        )
        if crawl_status in ("completed", "partial", "failed"):
            return status
        time.sleep(poll_interval)
# Wait for the crawl to finish
final_status = wait_for_crawl(crawl_id)
print(f"\nCrawl {final_status['status']}")
print(f"Total pages: {final_status['total']}")
print(f"Completed: {final_status['completed']}")
print(f"Failed: {final_status['failed']}")

Crawl Status Values
- discovering — Finding pages via sitemap/links
- scraping — Pages are being scraped
- completed — All pages finished successfully
- partial — Some pages failed
- failed — Crawl encountered a fatal error
- cancelled — Crawl was manually cancelled

Retrieve Results
Once the crawl completes, fetch results with include_results=true to get the scraped content for each page.
def get_crawl_results(crawl_id):
    """Retrieve all crawl results with page content.

    Args:
        crawl_id: ID of a finished (or cancelled) crawl.

    Returns:
        The full status payload, including per-page results and billing.
    """
    response = requests.get(
        f"{BASE_URL}/crawl/{crawl_id}",
        headers={"X-API-Key": API_KEY},
        params={"include_results": "true"},
        timeout=60,  # results payloads can be large; still bound the wait
    )
    data = response.json()
    if data.get("pages"):
        for page in data["pages"]:
            url = page["url"]
            status = page["status"]
            if status == "succeeded" and page.get("result"):
                content = page["result"]
                print(f"[OK] {url}")
                print(f" Content length: {len(content.get('text', ''))}")
                # Each requested format is available on the result object
                # (demonstrative — unused below).
                text = content.get("text", "")
                markdown = content.get("markdown", "")
                html = content.get("html", "")
                # Access structured extraction (if schema was provided)
                extracted = content.get("filtered_content")
                if extracted:
                    print(f" Extracted: {extracted}")
            else:
                error = page.get("error", "Unknown error")
                print(f"[FAIL] {url}: {error}")
    # Billing summary
    billing = data.get("billing", {})
    if billing:
        print(f"\nCredits used: {billing.get('credits_used', 0)}")
        print(f"Actual cost: {billing.get('actual_cost_usd', '$0.00')}")
        print(f"Tier breakdown: {billing.get('tier_breakdown', {})}")
    return data
results = get_crawl_results(crawl_id)

Structured Extraction During Crawl
Add extraction_schema or extraction_profile to your crawl request and every page will be processed through AlterLab's AI extraction pipeline. This is useful for pulling structured data (titles, dates, authors) from blog archives or product details from catalog pages.

Handling Large Sites
For sites with thousands of pages, use cost controls and pagination to manage scope and budget.
def crawl_large_site(url, budget_credits=500):
    """Crawl a large site with cost controls.

    Args:
        url: Start URL.
        budget_credits: Hard cap on credits reserved for this crawl.

    Returns:
        The crawl-start payload on success (HTTP 202), or None on failure.
    """
    response = requests.post(
        f"{BASE_URL}/crawl",
        headers={"X-API-Key": API_KEY},
        json={
            "url": url,
            "max_pages": 1000,
            "max_depth": 5,
            # Cost controls to stay within budget
            "cost_controls": {
                "max_credits": budget_credits,
                "max_tier": "3",  # Avoid expensive tier 4
            },
            # Focus on content pages
            "include_patterns": ["/docs/*", "/blog/*", "/help/*"],
            "exclude_patterns": [
                "/api/*", "/assets/*", "/cdn/*",
                "*.pdf", "*.zip", "*.png",
            ],
            # Get markdown output for LLM pipelines
            "formats": ["markdown"],
            # Get notified when done
            "webhook_url": "https://myapp.com/webhooks/crawl-done",
        },
        timeout=30,  # requests has no default timeout
    )
    # Parse JSON only on success — error bodies may not be JSON.
    if response.status_code == 202:
        data = response.json()
        print(f"Crawl {data['crawl_id']} started")
        print(f"Estimated pages: {data['estimated_pages']}")
        print(f"Credits reserved: {data['estimated_credits']}")
        # Credits will be refunded for any unused reservation
        # when the crawl completes
        return data
    # Surface the failure instead of silently returning None.
    print(f"Error starting crawl: {response.status_code} {response.text}")
    return None
# Crawl with a 500-credit budget
crawl_large_site("https://docs.example.com", budget_credits=500)

Credit Reservation
Credits are reserved upfront based on estimated page count and tier. When the crawl completes, unused credits are automatically refunded. Use max_credits to cap the total reservation.
Webhook Notifications
Instead of polling, provide a webhook_url and AlterLab will send a crawl.completed event when all pages are processed. This is recommended for large crawls that may take several minutes.
Cancel & Cleanup
Cancel a running crawl to stop processing. Unprocessed pages are automatically refunded.
def cancel_crawl(crawl_id):
    """Cancel a running crawl and get refund details.

    Args:
        crawl_id: ID of the crawl to cancel.

    Returns:
        The parsed response payload (refund/usage details on success,
        error detail otherwise).
    """
    response = requests.delete(
        f"{BASE_URL}/crawl/{crawl_id}",
        headers={"X-API-Key": API_KEY},
        timeout=30,  # requests has no default timeout
    )
    data = response.json()
    if response.status_code == 200:
        print(f"Crawl cancelled: {data['crawl_id']}")
        print(f"Jobs cancelled: {data['cancelled_jobs']}")
        print(f"Credits refunded: {data['credits_refunded']}")
        print(f"Credits used: {data['credits_used']}")
    else:
        print(f"Error: {data}")
    return data
# Cancel if crawl is taking too long or found enough data
cancel_crawl(crawl_id)

Partial Results
If you cancel mid-crawl, call GET /v1/crawl/{crawl_id}?include_results=true to retrieve results for any pages that finished before the cancellation took effect.

Best Practices
Do
- Start with a small max_pages to estimate scope before crawling the full site
- Use include_patterns to target specific sections instead of crawling everything
- Set cost_controls.max_credits to cap spending on large sites
- Use webhooks instead of polling for crawls over 100 pages
- Keep respect_robots: true (default) to follow robots.txt rules

Don't
- Set max_pages: 100000 without cost controls on an unknown site
- Use max_depth: 50 unless you specifically need deep link graphs
- Enable render_js for all pages unless necessary (increases cost and time)
- Run multiple concurrent crawls against the same domain (you are limited to 3 concurrent crawls per account)
- Ignore the billing summary in crawl results when optimizing costs
Complete Example
Here is a complete end-to-end example that starts a crawl, waits for completion, and saves the results:
import requests
import time
import json
API_KEY = "YOUR_API_KEY"
BASE_URL = "https://api.alterlab.io/api/v1"
def crawl_website(url, max_pages=50, max_depth=3, output_file="results.json"):
    """Complete crawl workflow: start, poll, retrieve, save.

    Args:
        url: Start URL for the crawl.
        max_pages: Upper bound on pages to scrape.
        max_depth: Maximum link-follow depth from the start URL.
        output_file: Path where succeeded pages are saved as JSON.

    Returns:
        The full results payload, or None if the crawl failed to start.
    """
    # 1. Start the crawl
    print(f"Starting crawl of {url}...")
    response = requests.post(
        f"{BASE_URL}/crawl",
        headers={"X-API-Key": API_KEY},
        json={
            "url": url,
            "max_pages": max_pages,
            "max_depth": max_depth,
            "formats": ["markdown", "text"],
            "cost_controls": {"max_tier": "3"},
        },
        timeout=30,  # requests has no default timeout
    )
    if response.status_code != 202:
        print(f"Failed to start crawl: {response.json()}")
        return None
    crawl_id = response.json()["crawl_id"]
    print(f"Crawl ID: {crawl_id}")
    # 2. Poll until the crawl reaches a terminal state
    while True:
        status_resp = requests.get(
            f"{BASE_URL}/crawl/{crawl_id}",
            headers={"X-API-Key": API_KEY},
            timeout=30,
        )
        status = status_resp.json()
        print(
            f" [{status['status']}] "
            f"{status['completed']}/{status['total']} pages"
        )
        if status["status"] in ("completed", "partial", "failed"):
            break
        time.sleep(5)
    # 3. Retrieve results with page content
    results_resp = requests.get(
        f"{BASE_URL}/crawl/{crawl_id}",
        headers={"X-API-Key": API_KEY},
        params={"include_results": "true"},
        timeout=60,  # results payloads can be large
    )
    results = results_resp.json()
    # 4. Save succeeded pages to file
    pages = [
        {
            "url": page["url"],
            "markdown": page["result"].get("markdown", ""),
            "text": page["result"].get("text", ""),
        }
        for page in results.get("pages", [])
        if page["status"] == "succeeded" and page.get("result")
    ]
    # Explicit encoding keeps the output identical across platforms.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(pages, f, indent=2)
    billing = results.get("billing", {})
    print(f"\nDone! {len(pages)} pages saved to {output_file}")
    print(f"Cost: {billing.get('actual_cost_usd', 'N/A')}")
    return results
# Usage
crawl_website("https://docs.example.com", max_pages=100)