LlamaIndex
Use AlterLab as a data connector in LlamaIndex to build RAG pipelines with live web content. Create a custom reader, index pages into vector stores, and query with natural language.
Why AlterLab + LlamaIndex?
LlamaIndex excels at indexing and querying data. AlterLab handles the hard part — getting clean content from any website, including JavaScript-heavy SPAs and anti-bot-protected pages. Together, they make web-powered RAG pipelines simple.
Installation
Bash
pip install alterlab llama-index llama-index-llms-openai llama-index-embeddings-openai
You need the AlterLab Python SDK and LlamaIndex core. The examples below use OpenAI for the LLM and embeddings, but you can substitute any LlamaIndex-compatible provider.
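For a non-OpenAI stack, point the LlamaIndex Settings at another provider. A minimal sketch using Ollama for the LLM and a Hugging Face embedding model (requires the llama-index-llms-ollama and llama-index-embeddings-huggingface packages; the model names are illustrative):
Python
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama

# Any LlamaIndex-compatible LLM and embedding model can stand in for OpenAI.
Settings.llm = Ollama(model="llama3.1", request_timeout=120.0)
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")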
Custom Reader
Create an AlterLab reader that implements the LlamaIndex BaseReader interface. This reader scrapes URLs and converts them into LlamaIndex Document objects.
Python
from typing import List, Optional

from alterlab import AlterLab
from llama_index.core import Document
from llama_index.core.readers.base import BaseReader


class AlterLabReader(BaseReader):
    """LlamaIndex reader that loads web pages via the AlterLab API."""

    def __init__(
        self,
        api_key: str,
        mode: str = "auto",
        output_format: str = "markdown",
    ):
        self.client = AlterLab(api_key=api_key)
        self.mode = mode
        self.output_format = output_format

    def load_data(
        self,
        urls: List[str],
        mode: Optional[str] = None,
        **kwargs,
    ) -> List[Document]:
        """Load web pages as LlamaIndex Documents."""
        documents = []
        scrape_mode = mode or self.mode

        for url in urls:
            result = self.client.scrape(
                url,
                formats=[self.output_format],
                mode=scrape_mode,
                **kwargs,
            )
            content = result.get(
                self.output_format,
                result.get("text", ""),
            )
            metadata = {
                "source": url,
                "title": result.get("metadata", {}).get("title", ""),
                "status_code": result.get("metadata", {}).get("status_code"),
                "credits_used": result.get("cost", {}).get("credits_charged"),
            }
            documents.append(
                Document(text=content, metadata=metadata)
            )

        return documents

Basic Usage
Python
from llama_index.core import Document
# Initialize the reader
reader = AlterLabReader(api_key="your_alterlab_key")
# Load a single page
documents = reader.load_data(["https://example.com/docs"])
print(f"Loaded {len(documents)} documents")
print(f"Content length: {len(documents[0].text)} chars")
print(f"Source: {documents[0].metadata['source']}")
# Load multiple pages
urls = [
    "https://example.com/docs/getting-started",
    "https://example.com/docs/api-reference",
    "https://example.com/docs/tutorials",
]
documents = reader.load_data(urls)
print(f"Loaded {len(documents)} documents")

RAG Pipeline
Step 1: Index Web Content
Python
from llama_index.core import VectorStoreIndex, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
# Configure LlamaIndex defaults
Settings.llm = OpenAI(model="gpt-4o-mini", temperature=0)
Settings.embed_model = OpenAIEmbedding()
# Load pages
reader = AlterLabReader(api_key="your_alterlab_key")
documents = reader.load_data([
    "https://docs.stripe.com/api/charges",
    "https://docs.stripe.com/api/customers",
    "https://docs.stripe.com/api/refunds",
])

# Build index
index = VectorStoreIndex.from_documents(documents)
print(f"Indexed {len(documents)} documents")

Step 2: Query
Python
# Create a query engine
query_engine = index.as_query_engine(similarity_top_k=3)
# Ask questions about the indexed content
response = query_engine.query("How do I create a charge in Stripe?")
print(response)
# The response includes source nodes with metadata
for node in response.source_nodes:
    print(f" Source: {node.metadata.get('source')}")
    print(f" Score: {node.score:.3f}")

Full Example
Complete end-to-end RAG pipeline that scrapes documentation and answers questions:
Python
from alterlab import AlterLab
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.core.readers.base import BaseReader
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from typing import List, Optional
class AlterLabReader(BaseReader):
    def __init__(self, api_key: str, mode: str = "auto"):
        self.client = AlterLab(api_key=api_key)
        self.mode = mode

    def load_data(self, urls: List[str], **kwargs) -> List[Document]:
        documents = []
        for url in urls:
            result = self.client.scrape(url, formats=["markdown"], mode=self.mode)
            documents.append(
                Document(
                    text=result.get("markdown", ""),
                    metadata={"source": url},
                )
            )
        return documents

# 1. Configure
Settings.llm = OpenAI(model="gpt-4o-mini", temperature=0)
Settings.embed_model = OpenAIEmbedding()

# 2. Load web pages
reader = AlterLabReader(api_key="your_alterlab_key")
documents = reader.load_data([
    "https://docs.stripe.com/api/charges",
    "https://docs.stripe.com/api/customers",
    "https://docs.stripe.com/api/refunds",
])

# 3. Index
index = VectorStoreIndex.from_documents(documents)

# 4. Query
engine = index.as_query_engine(similarity_top_k=3)

questions = [
    "How do I create a charge?",
    "What parameters does the refund endpoint accept?",
    "How do I list all customers?",
]

for q in questions:
    response = engine.query(q)
    print(f"Q: {q}")
    print(f"A: {response}\n")

Batch Loading
For larger document sets, extend the reader to use AlterLab's batch API:
Python
import time
from alterlab import AlterLab
from llama_index.core import Document
from typing import List
def batch_load(
    api_key: str,
    urls: List[str],
    output_format: str = "markdown",
) -> List[Document]:
    """Load many URLs efficiently via batch scraping."""
    client = AlterLab(api_key=api_key)

    # Submit batch
    batch = client.batch_scrape(
        urls=[{"url": u, "formats": [output_format]} for u in urls],
    )

    # Poll until done
    while True:
        status = client.get_batch_status(batch["batch_id"])
        if status["status"] != "processing":
            break
        time.sleep(2)

    # Convert to Documents
    documents = []
    for item in status["items"]:
        if item["status"] == "succeeded":
            documents.append(
                Document(
                    text=item["result"].get(output_format, ""),
                    metadata={"source": item["url"]},
                )
            )
    return documents

# Use it
docs = batch_load("your_api_key", [
    "https://example.com/page-1",
    "https://example.com/page-2",
    "https://example.com/page-3",
])
print(f"Loaded {len(docs)} documents via batch")

LlamaIndex vs LangChain
Both frameworks work well with AlterLab. Here is when to use each:
| Aspect | LlamaIndex | LangChain |
|---|---|---|
| Best for | Index-first RAG, knowledge bases, document QA | Multi-step chains, agents, complex orchestration |
| Index types | Vector, list, tree, keyword — built-in | Vector stores via integrations |
| Learning curve | Simpler for RAG-focused work | More flexible but more concepts to learn |
| AlterLab setup | Custom reader (shown above) | Document loader (see LangChain guide) |
Tips & Best Practices
- Use markdown format for the best RAG results. Markdown preserves document structure (headings, lists, code blocks), which improves chunking quality.
- Set chunk overlap in your node parser. LlamaIndex defaults work well, but a 200-token overlap helps with context continuity across chunks; see the node parser sketch below.
- Use metadata filtering: store the source URL in document metadata so you can filter queries by domain or page type, as in the filtering sketch below.
- Persist your index to avoid re-scraping. Use index.storage_context.persist() to save locally, or connect to a persistent vector store like Pinecone or Weaviate; see the persistence sketch below.
- Batch load for 10+ pages: the batch API is faster and handles concurrency. See the batch loading example above.
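The chunk-overlap tip, as a minimal sketch using LlamaIndex's SentenceSplitter; the chunk_size shown is illustrative rather than a tuned value:
Python
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter

# Split pages into ~1024-token chunks with 200 tokens of overlap so
# context carries across chunk boundaries.
parser = SentenceSplitter(chunk_size=1024, chunk_overlap=200)

index = VectorStoreIndex.from_documents(
    documents,
    transformations=[parser],
)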
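Metadata filtering, sketched with MetadataFilters from llama_index.core.vector_stores; it assumes the index and source URLs from the Stripe example above:
Python
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters

# Restrict retrieval to chunks scraped from the charges page.
filters = MetadataFilters(
    filters=[
        ExactMatchFilter(key="source", value="https://docs.stripe.com/api/charges"),
    ]
)

query_engine = index.as_query_engine(similarity_top_k=3, filters=filters)
response = query_engine.query("What fields are required to create a charge?")
print(response)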
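And a persistence sketch using LlamaIndex's local storage context; the ./storage directory is an arbitrary choice:
Python
from llama_index.core import StorageContext, load_index_from_storage

# First run: save the index (nodes + embeddings) to disk.
index.storage_context.persist(persist_dir="./storage")

# Later runs: reload instead of re-scraping and re-embedding.
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context)
query_engine = index.as_query_engine()
Reloading this way means repeat runs spend no AlterLab credits until you decide to rebuild the index with fresh scrapes.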