Build an AI Research Agent
Build an agent that searches the web, scrapes the top results, extracts structured data, and produces a research summary — all through the AlterLab API.
Time Estimate
What You'll Build
An AI research agent that follows four steps:
Search
Find relevant pages on the web using the AlterLab Search API.
Scrape
Scrape each result page to get the full content.
Extract
Pull structured data (key facts, dates, authors) from each page using a JSON schema.
Summarize
Feed the extracted data to an LLM to produce a research summary with citations.
Prerequisites
- An AlterLab API key with credits
- An OpenAI API key (for the LLM summarization step — you can substitute any LLM provider)
- Python 3.10+ or Node.js 18+
pip install requests openai

Step 1: Search for Sources
Start by searching for pages relevant to your research topic. We use time_range: "month" to prioritize recent content and scrape_results: true to fetch full text in one call.
import requests
import time
ALTERLAB_KEY = "YOUR_ALTERLAB_API_KEY"
BASE = "https://api.alterlab.io/api/v1"
def search_and_scrape(topic: str, num_sources: int = 5, max_polls: int = 60) -> dict:
    """Search for a topic and scrape the results.

    Runs an AlterLab search with scraping and structured extraction
    enabled. When the API answers 202 (async search), polls the status
    endpoint until the search completes.

    Args:
        topic: Search query string.
        num_sources: Number of results to request.
        max_polls: Upper bound on status polls (2 s apart) so a stuck
            or failed search cannot hang the caller forever.

    Returns:
        The completed search payload as a dict.

    Raises:
        requests.HTTPError: If the search request itself fails.
        RuntimeError: If the async search reports a failed state.
        TimeoutError: If the search does not complete within max_polls.
    """
    response = requests.post(
        f"{BASE}/search",
        headers={"X-API-Key": ALTERLAB_KEY},
        json={
            "query": topic,
            "num_results": num_sources,
            "time_range": "month",
            "scrape_results": True,
            "formats": ["text"],
            "extraction_schema": {
                "type": "object",
                "properties": {
                    "title": {
                        "type": "string",
                        "description": "Article or page title"
                    },
                    "author": {
                        "type": "string",
                        "description": "Author name if available"
                    },
                    "date_published": {
                        "type": "string",
                        "description": "Publication date in YYYY-MM-DD format"
                    },
                    "key_findings": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "3-5 main findings, facts, or claims"
                    },
                    "methodology": {
                        "type": "string",
                        "description": "Research methodology if mentioned"
                    }
                },
                "required": ["title", "key_findings"]
            }
        }
    )
    # Fail loudly on HTTP errors (401, 429, 5xx) instead of mis-parsing them.
    response.raise_for_status()
    data = response.json()
    # Poll if async (> 5 results): 202 means the search is still running.
    if response.status_code == 202:
        search_id = data["search_id"]
        for _ in range(max_polls):
            status = requests.get(
                f"{BASE}/search/{search_id}",
                headers={"X-API-Key": ALTERLAB_KEY}
            ).json()
            if status.get("status") == "completed":
                return status
            # assumes the API reports a terminal "failed" state — TODO confirm
            if status.get("status") == "failed":
                raise RuntimeError(f"Search {search_id} failed")
            time.sleep(2)
        raise TimeoutError(f"Search {search_id} did not complete after {max_polls} polls")
    return data
# Run it
results = search_and_scrape("large language model efficiency techniques 2026")
print(f"Found {results['results_count']} sources")

Step 2: Collect Extracted Data
The search + scrape call already extracted structured data. Now collect it into a clean list of sources:
def collect_sources(search_results: dict) -> list[dict]:
    """Flatten search results into a list of per-source records.

    Every record carries the page URL plus whatever structured fields the
    extraction step produced, with sensible fallbacks for missing data.
    """
    def _to_source(result: dict) -> dict:
        # Both "content" and "extraction" may be absent or None.
        page = result.get("content") or {}
        fields = page.get("extraction") or {}
        return {
            "url": result["url"],
            "title": fields.get("title", result["title"]),
            "author": fields.get("author", "Unknown"),
            "date": fields.get("date_published"),
            "findings": fields.get("key_findings", []),
            "methodology": fields.get("methodology"),
            "text_preview": (page.get("text") or "")[:500],
        }

    return [_to_source(result) for result in search_results.get("results", [])]
sources = collect_sources(results)
# Preview what we collected
for i, s in enumerate(sources, 1):
print(f"\n[{i}] {s['title']}")
print(f" URL: {s['url']}")
print(f" Author: {s['author']} | Date: {s['date']}")
print(f" Findings: {len(s['findings'])} key points")
for f in s["findings"]:
print(f" - {f}")

Step 3: Summarize with an LLM
Feed the extracted data to an LLM to produce a research summary with proper citations. We use OpenAI here, but any LLM works.
from openai import OpenAI
openai_client = OpenAI(api_key="YOUR_OPENAI_API_KEY")
def summarize_research(topic: str, sources: list[dict]) -> str:
    """Generate a research summary from collected sources.

    Renders every source as a numbered [Source N] block, then asks the
    LLM to synthesize them into a cited summary.
    """
    def _render(index: int, src: dict) -> str:
        # One bullet per extracted finding.
        bullets = "\n".join(f" - {item}" for item in src["findings"])
        return (
            f"[Source {index}] {src['title']}\n"
            f"URL: {src['url']}\n"
            f"Author: {src['author']} | Date: {src['date']}\n"
            f"Key findings:\n{bullets}\n"
            f"Text preview: {src['text_preview']}"
        )

    context = "\n\n---\n\n".join(
        _render(index, src) for index, src in enumerate(sources, 1)
    )

    system_prompt = (
        "You are a research analyst. Synthesize the provided "
        "sources into a clear, well-structured research summary. "
        "Cite sources using [Source N] notation. Highlight areas "
        "of agreement and disagreement between sources."
    )
    user_prompt = (
        f"Research topic: {topic}\n\n"
        f"Sources:\n\n{context}\n\n"
        "Write a research summary (300-500 words) with:\n"
        "1. Executive summary (2-3 sentences)\n"
        "2. Key findings (organized by theme)\n"
        "3. Areas of disagreement\n"
        "4. Conclusion and recommended next steps"
    )

    reply = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.3,
    )
    return reply.choices[0].message.content
summary = summarize_research(
"large language model efficiency techniques 2026",
sources
)
print(summary)

Full Agent Code
Here is the complete research agent combining all three steps:
"""AI Research Agent — Search, scrape, extract, and summarize."""
import requests
import time
from openai import OpenAI
# Configuration
ALTERLAB_KEY = "YOUR_ALTERLAB_API_KEY"
OPENAI_KEY = "YOUR_OPENAI_API_KEY"
BASE = "https://api.alterlab.io/api/v1"
openai_client = OpenAI(api_key=OPENAI_KEY)
# JSON schema handed to the AlterLab extraction step: the structured
# fields to pull out of every scraped page.  Only "title" and
# "key_findings" are required; author/date/methodology are best-effort.
EXTRACTION_SCHEMA = {
    "type": "object",
    "properties": {
        "title": {"type": "string", "description": "Article title"},
        "author": {"type": "string", "description": "Author name"},
        "date_published": {"type": "string", "description": "Date in YYYY-MM-DD"},
        "key_findings": {
            "type": "array",
            "items": {"type": "string"},
            "description": "3-5 main findings or claims"
        },
        "methodology": {"type": "string", "description": "Research method if mentioned"}
    },
    "required": ["title", "key_findings"]
}
def _run_search(topic: str, num_sources: int, max_polls: int) -> dict:
    """Step 1: submit the search request and wait for scraping to finish.

    Raises:
        requests.HTTPError: on a non-2xx search response.
        RuntimeError: if the async search reports a failed state.
        TimeoutError: if polling exceeds max_polls attempts (2 s apart).
    """
    response = requests.post(
        f"{BASE}/search",
        headers={"X-API-Key": ALTERLAB_KEY},
        json={
            "query": topic,
            "num_results": num_sources,
            "time_range": "month",
            "scrape_results": True,
            "formats": ["text"],
            "extraction_schema": EXTRACTION_SCHEMA
        }
    )
    # Surface HTTP errors (401, 429, 5xx) instead of mis-parsing them.
    response.raise_for_status()
    data = response.json()
    if response.status_code != 202:
        return data

    # HTTP 202 = the search runs asynchronously; poll until it completes.
    search_id = data["search_id"]
    for _ in range(max_polls):
        status = requests.get(
            f"{BASE}/search/{search_id}",
            headers={"X-API-Key": ALTERLAB_KEY}
        ).json()
        # Progress keys may be absent early in the search lifecycle.
        print(f" Scraping: {status.get('completed', 0)}/{status.get('results_count', '?')} done")
        if status.get("status") == "completed":
            return status
        # assumes the API reports a terminal "failed" state — TODO confirm
        if status.get("status") == "failed":
            raise RuntimeError(f"Search {search_id} failed")
        time.sleep(2)
    raise TimeoutError(f"Search {search_id} did not complete after {max_polls} polls")

def _collect_sources(data: dict) -> list[dict]:
    """Step 2: flatten the raw search payload into per-source records."""
    sources = []
    for result in data.get("results", []):
        content = result.get("content") or {}
        ext = content.get("extraction") or {}
        sources.append({
            "url": result["url"],
            "title": ext.get("title", result["title"]),
            "author": ext.get("author", "Unknown"),
            "date": ext.get("date_published"),
            "findings": ext.get("key_findings", []),
            "methodology": ext.get("methodology"),
            # Cap the excerpt so the LLM prompt stays small.
            "text": (content.get("text") or "")[:1000],
        })
    return sources

def _summarize(topic: str, sources: list[dict]) -> str:
    """Step 3: ask the LLM for a cited synthesis of the collected sources."""
    source_texts = []
    for i, s in enumerate(sources, 1):
        findings = "\n".join(f" - {f}" for f in s["findings"])
        source_texts.append(
            f"[Source {i}] {s['title']}\n"
            f"URL: {s['url']}\n"
            f"Author: {s['author']} | Date: {s['date']}\n"
            f"Findings:\n{findings}\n"
            f"Text: {s['text']}"
        )
    context = "\n\n---\n\n".join(source_texts)
    return openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a research analyst. Synthesize sources into a "
                    "structured summary. Cite with [Source N]. Highlight "
                    "agreements and disagreements."
                )
            },
            {
                "role": "user",
                "content": (
                    f"Topic: {topic}\n\n{context}\n\n"
                    "Write a 300-500 word summary with:\n"
                    "1. Executive summary\n"
                    "2. Key findings by theme\n"
                    "3. Disagreements\n"
                    "4. Conclusion"
                )
            }
        ],
        temperature=0.3,
    ).choices[0].message.content

def research(topic: str, num_sources: int = 5, max_polls: int = 60) -> str:
    """Run the full research pipeline: search -> extract -> summarize.

    Args:
        topic: Research question / search query.
        num_sources: Number of web sources to gather.
        max_polls: Upper bound on async-search status polls (2 s apart).

    Returns:
        The LLM summary followed by a numbered source reference list.

    Raises:
        requests.HTTPError, RuntimeError, TimeoutError: see _run_search.
    """
    print(f"Researching: {topic}")
    print(f"Searching for {num_sources} sources...\n")
    data = _run_search(topic, num_sources, max_polls)

    sources = _collect_sources(data)
    print(f"\nCollected {len(sources)} sources:")
    for i, s in enumerate(sources, 1):
        print(f" [{i}] {s['title']}")

    print("\nGenerating summary...\n")
    summary = _summarize(topic, sources)

    # Append a plain reference list so URLs survive outside the LLM output.
    refs = "\n".join(
        f"[{i}] {s['title']} — {s['url']}"
        for i, s in enumerate(sources, 1)
    )
    return f"{summary}\n\n---\nSources:\n{refs}"
if __name__ == "__main__":
report = research("large language model efficiency techniques 2026")
print("=" * 60)
print(report)

LangChain Integration
Wrap the AlterLab search as a LangChain tool for use in agent chains:
pip install langchain langchain-openai

from langchain.tools import tool
from langchain_openai import ChatOpenAI
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain.prompts import ChatPromptTemplate
import requests
import time
ALTERLAB_KEY = "YOUR_ALTERLAB_API_KEY"
BASE = "https://api.alterlab.io/api/v1"
@tool
def web_search(query: str) -> str:
    """Search the web and return relevant page content. Use this to find
    information about any topic. Returns titles, URLs, and key findings."""
    # The docstring above doubles as the tool description shown to the
    # LLM, so it is kept verbatim.
    payload = {
        "query": query,
        "num_results": 5,
        "time_range": "month",
        "scrape_results": True,
        "formats": ["text"],
        "extraction_schema": {
            "type": "object",
            "properties": {
                "title": {"type": "string"},
                "key_findings": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "3-5 key facts"
                }
            },
            "required": ["title", "key_findings"]
        }
    }
    response = requests.post(
        f"{BASE}/search",
        headers={"X-API-Key": ALTERLAB_KEY},
        json=payload,
    )
    data = response.json()

    # Poll if async: HTTP 202 means the search is still running.
    # Bounded at 30 attempts x 2 s so the tool cannot hang the agent.
    if response.status_code == 202:
        search_id = data["search_id"]
        for _ in range(30):
            status = requests.get(
                f"{BASE}/search/{search_id}",
                headers={"X-API-Key": ALTERLAB_KEY}
            ).json()
            if status["status"] == "completed":
                data = status
                break
            time.sleep(2)

    # Render one bullet per result for the agent to read.
    lines = []
    for hit in data.get("results", []):
        extracted = (hit.get("content") or {}).get("extraction") or {}
        facts = extracted.get("key_findings", [])
        summary = "; ".join(facts) if facts else "No findings extracted"
        lines.append(f"- {hit['title']} ({hit['url']}): {summary}")
    return "\n".join(lines) if lines else "No results found."
@tool
def scrape_page(url: str) -> str:
    """Scrape a specific URL and return its text content."""
    resp = requests.post(
        f"{BASE}/scrape",
        headers={"X-API-Key": ALTERLAB_KEY},
        json={"url": url, "formats": ["text"]},
    )
    body = resp.json()
    text = body.get("text") or ""
    # Truncate so a single page cannot blow the agent's context window.
    return text[:3000]
# Create the agent: a tool-calling loop where the model decides when to
# invoke web_search / scrape_page.
llm = ChatOpenAI(model="gpt-4o", temperature=0.3)
tools = [web_search, scrape_page]
prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You are a research assistant. Use the web_search tool to find "
     "information, and scrape_page for deeper reads. Always cite sources."),
    ("human", "{input}"),
    # Slot where the agent's intermediate tool calls/results are injected.
    ("placeholder", "{agent_scratchpad}"),
])
agent = create_tool_calling_agent(llm, tools, prompt)
# verbose=True prints each tool call and observation while the agent runs.
executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
# Run the agent
result = executor.invoke({
"input": "Research the latest advances in quantum error correction. "
"Find at least 3 sources and summarize the key breakthroughs."
})
print(result["output"])

Node.js Version
Here is the same research agent in TypeScript:
import OpenAI from "openai";
const ALTERLAB_KEY = "YOUR_ALTERLAB_API_KEY";
const BASE = "https://api.alterlab.io/api/v1";
const openai = new OpenAI({ apiKey: "YOUR_OPENAI_API_KEY" });
// Normalized record for one scraped research source.
interface Source {
  url: string;
  title: string; // extracted title, falling back to the search-result title
  author: string; // "Unknown" when extraction found no author
  date: string | null; // date_published from extraction, if any
  findings: string[]; // key_findings from the extraction schema
  text: string; // first 1000 chars of the scraped page text
}
// Full research pipeline: search -> scrape/extract -> LLM summary.
// Returns the summary text followed by a numbered reference list.
async function research(topic: string, numSources = 5): Promise<string> {
  console.log(`Researching: ${topic}`);
  // Step 1: Search + Scrape + Extract — one call does all three because
  // scrape_results and extraction_schema are set on the search request.
  const searchRes = await fetch(`${BASE}/search`, {
    method: "POST",
    headers: {
      "X-API-Key": ALTERLAB_KEY,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      query: topic,
      num_results: numSources,
      time_range: "month",
      scrape_results: true,
      formats: ["text"],
      extraction_schema: {
        type: "object",
        properties: {
          title: { type: "string" },
          author: { type: "string" },
          date_published: { type: "string" },
          key_findings: {
            type: "array",
            items: { type: "string" },
            description: "3-5 key findings",
          },
        },
        required: ["title", "key_findings"],
      },
    }),
  });
  let data = await searchRes.json();
  // Poll if async: HTTP 202 means the search is still running server-side.
  // NOTE(review): this loop has no upper bound — a search that never
  // reaches "completed" would spin forever; consider a max-attempt cap.
  if (searchRes.status === 202) {
    const searchId = data.search_id;
    while (true) {
      await new Promise((r) => setTimeout(r, 2000));
      const status = await fetch(`${BASE}/search/${searchId}`, {
        headers: { "X-API-Key": ALTERLAB_KEY },
      }).then((r) => r.json());
      console.log(` Scraping: ${status.completed}/${status.results_count}`);
      if (status.status === "completed") {
        data = status;
        break;
      }
    }
  }
  // Step 2: Collect sources — flatten each result into a Source record,
  // with fallbacks when extraction fields are missing.
  const sources: Source[] = data.results.map((r: any) => {
    const ext = r.content?.extraction ?? {};
    return {
      url: r.url,
      title: ext.title ?? r.title,
      author: ext.author ?? "Unknown",
      date: ext.date_published ?? null,
      findings: ext.key_findings ?? [],
      text: (r.content?.text ?? "").slice(0, 1000), // cap prompt size
    };
  });
  console.log(`\nCollected ${sources.length} sources`);
  // Step 3: Summarize with LLM — one "[Source N]" block per source so the
  // model can cite by number.
  const context = sources
    .map(
      (s, i) =>
        `[Source ${i + 1}] ${s.title}\n` +
        `URL: ${s.url}\n` +
        `Findings:\n${s.findings.map((f) => ` - ${f}`).join("\n")}\n` +
        `Text: ${s.text}`
    )
    .join("\n\n---\n\n");
  const completion = await openai.chat.completions.create({
    model: "gpt-4o",
    messages: [
      {
        role: "system",
        content:
          "Synthesize sources into a structured summary. Cite with [Source N].",
      },
      {
        role: "user",
        content: `Topic: ${topic}\n\n${context}\n\nWrite a 300-500 word summary.`,
      },
    ],
    temperature: 0.3,
  });
  const summary = completion.choices[0].message.content;
  // Append a plain reference list so URLs survive outside the LLM output.
  const refs = sources
    .map((s, i) => `[${i + 1}] ${s.title} — ${s.url}`)
    .join("\n");
  return `${summary}\n\n---\nSources:\n${refs}`;
}
// Run it
const report = await research("large language model efficiency techniques 2026");
console.log(report);Next Steps
Add Multi-Query Research
Run multiple searches with different queries and combine the results for more comprehensive research. Use different time_range values for historical perspective.
Add Domain-Specific Search
Use the domain parameter to search specific sites (e.g., arxiv.org for papers, news.ycombinator.com for tech discussions).
Use Batch for Scale
For large-scale research, use Batch Scraping to process many URLs in parallel after the initial search.
Explore the Search Guide
See the Search Guide for more patterns: geo-targeted search, time-filtered discovery, and competitive analysis.