Multi-Page Crawling
Scrape entire websites with automatic sitemap discovery, depth-based link following, and URL pattern filtering.
Prerequisites
Common Use Cases
Documentation Sites
Index entire documentation portals for search, RAG pipelines, or offline reference.
Product Catalogs
Crawl e-commerce category pages to discover and scrape every product listing.
Blog Archives
Extract all articles from a blog for content analysis, migration, or archival.
Knowledge Bases
Scrape help centers and wikis to build training data for AI models or chatbots.
Start a Crawl
The crawl endpoint discovers pages via sitemap.xml and enqueues them for scraping. If no sitemap is found, it falls back to link extraction from the start URL.
import requests
import time
API_KEY = "YOUR_API_KEY"
BASE_URL = "https://api.alterlab.io/api/v1"
def start_crawl(url, max_pages=50, max_depth=3):
    """Start a website crawl with sitemap discovery.

    Args:
        url: Start URL; sitemap discovery begins at its domain root.
        max_pages: Upper bound on pages enqueued for scraping.
        max_depth: Maximum link-follow depth from the start URL.

    Returns:
        The crawl_id string on success (HTTP 202), or None on failure.
    """
    response = requests.post(
        f"{BASE_URL}/crawl",
        headers={"X-API-Key": API_KEY},
        json={
            "url": url,
            "max_pages": max_pages,
            "max_depth": max_depth,
            "respect_robots": True,
        },
        timeout=30,  # requests has no default timeout; avoid hanging forever
    )
    # Check the status code BEFORE parsing: error responses (e.g. a 500
    # from a proxy) may not be valid JSON and would raise in .json().
    if response.status_code == 202:
        data = response.json()
        print(f"Crawl started: {data['crawl_id']}")
        print(f"Pages discovered: {data['estimated_pages']}")
        print(f"Credits reserved: {data['estimated_credits']}")
        return data["crawl_id"]
    try:
        detail = response.json()
    except ValueError:
        detail = response.text
    print(f"Error: {detail}")
    return None
# Crawl a documentation site
crawl_id = start_crawl("https://docs.example.com", max_pages=100)

How Discovery Works
AlterLab tries these discovery methods in order: (1) sitemap.xml at the domain root, (2) sitemap_index.xml for sites with multiple sitemaps, (3) link extraction from the start URL as a fallback. Pages are then enqueued for scraping with automatic deduplication.
Depth & URL Filters
Control exactly which pages get scraped using depth limits and glob patterns. This is especially useful for large sites where you only need specific sections.
def crawl_with_filters(url):
    """Crawl with depth limits and URL pattern filters.

    Args:
        url: Start URL for the crawl.

    Returns:
        The parsed JSON response from the crawl endpoint.
    """
    response = requests.post(
        f"{BASE_URL}/crawl",
        headers={"X-API-Key": API_KEY},
        json={
            "url": url,
            "max_pages": 200,
            "max_depth": 2,
            # Only scrape blog posts and docs
            "include_patterns": [
                "/blog/*",
                "/docs/*",
                "/guides/*",
            ],
            # Skip admin pages, assets, and feeds
            "exclude_patterns": [
                "/admin/*",
                "/assets/*",
                "*.xml",
                "*.json",
                "/tag/*",
                "/author/*",
            ],
            # Stay on the start domain; set True to also crawl
            # subdomains such as blog.example.com
            "include_subdomains": False,
            # Output as markdown for LLM consumption
            "formats": ["markdown", "text"],
        },
        timeout=30,  # requests has no default timeout
    )
    return response.json()
# Crawl only the blog section of a site
result = crawl_with_filters("https://example.com")
print(f"Crawl ID: {result.get('crawl_id')}")
print(f"Pages to scrape: {result.get('estimated_pages')}")

Depth Explained
max_depth: 0 scrapes only the start URL. max_depth: 1 scrapes the start URL plus any pages linked from it. max_depth: 3 (default) follows links up to 3 hops from the start URL. Higher depth values discover more pages but use more credits.

Poll Crawl Progress
Crawls run asynchronously. Poll the status endpoint to track progress and detect completion.
def wait_for_crawl(crawl_id, poll_interval=5):
    """Poll crawl status until it reaches a terminal state.

    Args:
        crawl_id: ID returned when the crawl was started.
        poll_interval: Seconds to sleep between status checks.

    Returns:
        The final status payload (dict) once the crawl reports
        "completed", "partial", or "failed".
    """
    while True:
        response = requests.get(
            f"{BASE_URL}/crawl/{crawl_id}",
            headers={"X-API-Key": API_KEY},
            # Per-request timeout so one hung request can't stall
            # the polling loop forever.
            timeout=30,
        )
        status = response.json()
        completed = status["completed"]
        total = status["total"]
        failed = status["failed"]
        crawl_status = status["status"]
        depth = status.get("current_depth", 0)
        max_depth = status.get("max_depth", 0)
        print(
            f"[{crawl_status}] "
            f"{completed}/{total} pages done, "
            f"{failed} failed, "
            f"depth {depth}/{max_depth}"
        )
        if crawl_status in ("completed", "partial", "failed"):
            return status
        time.sleep(poll_interval)
# Wait for the crawl to finish
final_status = wait_for_crawl(crawl_id)
print(f"\nCrawl {final_status['status']}")
print(f"Total pages: {final_status['total']}")
print(f"Completed: {final_status['completed']}")
print(f"Failed: {final_status['failed']}")

Crawl Status Values
- discovering — Finding pages via sitemap/links
- scraping — Pages are being scraped
- completed — All pages finished successfully
- partial — Some pages failed
- failed — Crawl encountered a fatal error
- cancelled — Crawl was manually cancelled

Retrieve Results
Once the crawl completes, fetch results with include_results=true to get the scraped content for each page.
def get_crawl_results(crawl_id):
    """Retrieve all crawl results with page content.

    Args:
        crawl_id: ID of a finished (or cancelled) crawl.

    Returns:
        The full status payload, including per-page results and billing.
    """
    response = requests.get(
        f"{BASE_URL}/crawl/{crawl_id}",
        headers={"X-API-Key": API_KEY},
        params={"include_results": "true"},
        timeout=60,  # results payloads can be large; still bound the wait
    )
    data = response.json()
    if data.get("pages"):
        for page in data["pages"]:
            url = page["url"]
            status = page["status"]
            if status == "succeeded" and page.get("result"):
                content = page["result"]
                print(f"[OK] {url}")
                print(f" Content length: {len(content.get('text', ''))}")
                # Each requested format is available on the result object
                # (demonstrative — unused below).
                text = content.get("text", "")
                markdown = content.get("markdown", "")
                html = content.get("html", "")
                # Access structured extraction (if schema was provided)
                extracted = content.get("filtered_content")
                if extracted:
                    print(f" Extracted: {extracted}")
            else:
                error = page.get("error", "Unknown error")
                print(f"[FAIL] {url}: {error}")
    # Billing summary
    billing = data.get("billing", {})
    if billing:
        print(f"\nCredits used: {billing.get('credits_used', 0)}")
        print(f"Actual cost: {billing.get('actual_cost_usd', '$0.00')}")
        print(f"Tier breakdown: {billing.get('tier_breakdown', {})}")
    return data
results = get_crawl_results(crawl_id)

Structured Extraction During Crawl
Add extraction_schema or extraction_profile to your crawl request and every page will be processed through AlterLab's AI extraction pipeline. This is useful for pulling structured data (titles, dates, authors) from blog archives or product details from catalog pages.

Handling Large Sites
For sites with thousands of pages, use cost controls and pagination to manage scope and budget.
def crawl_large_site(url, budget_credits=500):
    """Crawl a large site with cost controls.

    Args:
        url: Start URL.
        budget_credits: Hard cap on credits reserved for this crawl.

    Returns:
        The crawl-start payload on success (HTTP 202), or None on failure.
    """
    response = requests.post(
        f"{BASE_URL}/crawl",
        headers={"X-API-Key": API_KEY},
        json={
            "url": url,
            "max_pages": 1000,
            "max_depth": 5,
            # Cost controls to stay within budget
            "cost_controls": {
                "max_credits": budget_credits,
                "max_tier": "3",  # Avoid expensive tier 4
            },
            # Focus on content pages
            "include_patterns": ["/docs/*", "/blog/*", "/help/*"],
            "exclude_patterns": [
                "/api/*", "/assets/*", "/cdn/*",
                "*.pdf", "*.zip", "*.png",
            ],
            # Get markdown output for LLM pipelines
            "formats": ["markdown"],
            # Get notified when done
            "webhook_url": "https://myapp.com/webhooks/crawl-done",
        },
        timeout=30,  # requests has no default timeout
    )
    # Parse JSON only on success — error bodies may not be JSON.
    if response.status_code == 202:
        data = response.json()
        print(f"Crawl {data['crawl_id']} started")
        print(f"Estimated pages: {data['estimated_pages']}")
        print(f"Credits reserved: {data['estimated_credits']}")
        # Credits will be refunded for any unused reservation
        # when the crawl completes
        return data
    # Surface the failure instead of silently returning None.
    print(f"Error starting crawl: {response.status_code} {response.text}")
    return None
# Crawl with a 500-credit budget
crawl_large_site("https://docs.example.com", budget_credits=500)

Credit Reservation
Credits are reserved upfront based on estimated page count and tier. When the crawl completes, unused credits are automatically refunded. Use max_credits to cap the total reservation.
Webhook Notifications
Instead of polling, provide a webhook_url and AlterLab will send a crawl.completed event when all pages are processed. This is recommended for large crawls that may take several minutes.
Cancel & Cleanup
Cancel a running crawl to stop processing. Unprocessed pages are automatically refunded.
def cancel_crawl(crawl_id):
    """Cancel a running crawl and get refund details.

    Args:
        crawl_id: ID of the crawl to cancel.

    Returns:
        The parsed response payload (refund/usage details on success,
        error detail otherwise).
    """
    response = requests.delete(
        f"{BASE_URL}/crawl/{crawl_id}",
        headers={"X-API-Key": API_KEY},
        timeout=30,  # requests has no default timeout
    )
    data = response.json()
    if response.status_code == 200:
        print(f"Crawl cancelled: {data['crawl_id']}")
        print(f"Jobs cancelled: {data['cancelled_jobs']}")
        print(f"Credits refunded: {data['credits_refunded']}")
        print(f"Credits used: {data['credits_used']}")
    else:
        print(f"Error: {data}")
    return data
# Cancel if crawl is taking too long or found enough data
cancel_crawl(crawl_id)

Partial Results
If you cancel mid-crawl, call GET /v1/crawl/{crawl_id}?include_results=true to retrieve results for any pages that finished before the cancellation took effect.

Best Practices
Do
- Start with a small max_pages to estimate scope before crawling the full site
- Use include_patterns to target specific sections instead of crawling everything
- Set cost_controls.max_credits to cap spending on large sites
- Use webhooks instead of polling for crawls over 100 pages
- Keep respect_robots: true (default) to follow robots.txt rules

Don't
- Set max_pages: 100000 without cost controls on an unknown site
- Use max_depth: 50 unless you specifically need deep link graphs
- Enable render_js for all pages unless necessary (increases cost and time)
- Run multiple concurrent crawls against the same domain (you are limited to 3 concurrent crawls per account)
- Ignore the billing summary in crawl results when optimizing costs
Complete Example
Here is a complete end-to-end example that starts a crawl, waits for completion, and saves the results:
import requests
import time
import json
API_KEY = "YOUR_API_KEY"
BASE_URL = "https://api.alterlab.io/api/v1"
def crawl_website(url, max_pages=50, max_depth=3, output_file="results.json"):
    """Complete crawl workflow: start, poll, retrieve, save.

    Args:
        url: Start URL for the crawl.
        max_pages: Upper bound on pages to scrape.
        max_depth: Maximum link-follow depth from the start URL.
        output_file: Path where succeeded pages are saved as JSON.

    Returns:
        The full results payload, or None if the crawl failed to start.
    """
    # 1. Start the crawl
    print(f"Starting crawl of {url}...")
    response = requests.post(
        f"{BASE_URL}/crawl",
        headers={"X-API-Key": API_KEY},
        json={
            "url": url,
            "max_pages": max_pages,
            "max_depth": max_depth,
            "formats": ["markdown", "text"],
            "cost_controls": {"max_tier": "3"},
        },
        timeout=30,  # requests has no default timeout
    )
    if response.status_code != 202:
        print(f"Failed to start crawl: {response.json()}")
        return None
    crawl_id = response.json()["crawl_id"]
    print(f"Crawl ID: {crawl_id}")
    # 2. Poll until the crawl reaches a terminal state
    while True:
        status_resp = requests.get(
            f"{BASE_URL}/crawl/{crawl_id}",
            headers={"X-API-Key": API_KEY},
            timeout=30,
        )
        status = status_resp.json()
        print(
            f" [{status['status']}] "
            f"{status['completed']}/{status['total']} pages"
        )
        if status["status"] in ("completed", "partial", "failed"):
            break
        time.sleep(5)
    # 3. Retrieve results with page content
    results_resp = requests.get(
        f"{BASE_URL}/crawl/{crawl_id}",
        headers={"X-API-Key": API_KEY},
        params={"include_results": "true"},
        timeout=60,  # results payloads can be large
    )
    results = results_resp.json()
    # 4. Save succeeded pages to file
    pages = [
        {
            "url": page["url"],
            "markdown": page["result"].get("markdown", ""),
            "text": page["result"].get("text", ""),
        }
        for page in results.get("pages", [])
        if page["status"] == "succeeded" and page.get("result")
    ]
    # Explicit encoding keeps the output identical across platforms.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(pages, f, indent=2)
    billing = results.get("billing", {})
    print(f"\nDone! {len(pages)} pages saved to {output_file}")
    print(f"Cost: {billing.get('actual_cost_usd', 'N/A')}")
    return results
# Usage
crawl_website("https://docs.example.com", max_pages=100)