E-commerce Scraping
Learn how to scrape product data, monitor prices, and extract structured information from e-commerce websites.
Legal Considerations
Common Use Cases
Price Monitoring
Track competitor prices, detect sales, and optimize your pricing strategy.
Product Research
Analyze product catalogs, features, and market trends.
Inventory Tracking
Monitor stock levels and availability across retailers.
Review Aggregation
Collect and analyze customer reviews and ratings.
Scraping Product Pages
Use extraction_schema to get structured product data matching your desired format:
import requests

def scrape_product(url):
    """Scrape a single product page with schema filtering.

    Args:
        url: Product page URL to scrape.

    Returns:
        dict of schema-filtered product fields, or None when the request
        fails or the response carries no filtered content.
    """
    response = requests.post(
        "https://api.alterlab.io/api/v1/scrape",
        headers={"X-API-Key": "YOUR_API_KEY"},
        json={
            "url": url,
            "extraction_schema": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "price": {"type": "number"},
                    "currency": {"type": "string"},
                    "in_stock": {"type": "boolean"},
                    "rating": {"type": "number"},
                    "images": {"type": "array"}
                }
            }
        }
    )
    # Check the status first: error responses may not be valid JSON,
    # so parsing unconditionally could raise instead of returning None.
    if response.status_code != 200:
        return None
    data = response.json()
    return data.get("filtered_content") or None

# Example usage
product = scrape_product("https://amazon.com/dp/B0123456789")
if product:
    print(f"{product['name']}: ${product['price']}")

Common Product Fields
Request these fields in your schema. Field aliases are automatically resolved (e.g., in_stock → availability):
name, price, currency, rating, in_stock, description, images, brand, sku

Scraping Category Pages
For category/listing pages, the raw content contains product listings. Use extraction_schema to filter the structured data:
import requests

def scrape_category(url):
    """Scrape products from a category/listing page.

    Args:
        url: Category or search-results page URL.

    Returns:
        dict of schema-filtered data, or an empty dict on failure.
    """
    response = requests.post(
        "https://api.alterlab.io/api/v1/scrape",
        headers={"X-API-Key": "YOUR_API_KEY"},
        json={
            "url": url,
            "extraction_schema": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "price": {"type": "number"},
                    "image": {"type": "string"},
                    "rating": {"type": "number"}
                }
            }
        }
    )
    # Only parse the body on success: error responses may not be JSON.
    if response.status_code != 200:
        return {}
    data = response.json()
    # filtered_content contains data matching your schema
    return data.get("filtered_content", {})

# Scrape a category page (note: a listing URL, not a single-product URL)
product = scrape_category("https://amazon.com/s?k=laptops")
if product:
    print(f"Product: {product.get('name')}")
    print(f"Price: ${product.get('price')}")

Tip: Batch Scraping
Price Monitoring
Set up automated price tracking with caching and webhooks:
import requests
from datetime import datetime

class PriceMonitor:
    """Automated price tracking against the AlterLab scrape API."""

    def __init__(self, api_key):
        self.api_key = api_key
        self.base_url = "https://api.alterlab.io/api/v1/scrape"

    def check_price(self, url, product_name=None):
        """Check current price of a product.

        Args:
            url: Product page URL.
            product_name: Optional human-readable label echoed in the result.

        Returns:
            dict with price/stock fields and an ISO timestamp, or None when
            the request fails or yields no filtered content.
        """
        response = requests.post(
            self.base_url,
            headers={"X-API-Key": self.api_key},
            json={
                "url": url,
                "extraction_schema": {
                    "type": "object",
                    "properties": {
                        "price": {"type": "number"},
                        "original_price": {"type": "number"},
                        "in_stock": {"type": "boolean"}
                    }
                },
                # Don't cache for price monitoring
                "cache": False
            }
        )
        # Verify the status before parsing: error bodies may not be JSON,
        # and parsing them unconditionally could raise.
        if response.status_code != 200:
            return None
        filtered = response.json().get("filtered_content", {})
        if not filtered:
            return None
        return {
            "url": url,
            "product_name": product_name,
            "price": filtered.get("price"),
            "original_price": filtered.get("original_price"),
            "in_stock": filtered.get("in_stock", True),
            "checked_at": datetime.now().isoformat()
        }

    def compare_prices(self, urls):
        """Compare prices across multiple retailers, cheapest first."""
        results = []
        for url in urls:
            price_data = self.check_price(url)
            if price_data:
                results.append(price_data)
        # Sort by price; entries with a missing/None price sort last.
        return sorted(results, key=lambda x: x.get("price") or float('inf'))

# Usage
monitor = PriceMonitor("YOUR_API_KEY")

# Track a single product
price = monitor.check_price(
    "https://amazon.com/dp/B0123456789",
    "iPhone 15 Pro"
)
if price:
    print(f"Current price: ${price['price']}")

# Compare across retailers
urls = [
    "https://amazon.com/dp/B0123456789",
    "https://bestbuy.com/product/123",
    "https://walmart.com/ip/456"
]
comparison = monitor.compare_prices(urls)
if comparison:
    print(f"Best price: ${comparison[0]['price']} at {comparison[0]['url']}")

Scheduling
Handling Product Variants
Extract variant information (sizes, colors, configurations):
import requests

def scrape_product_with_variants(url):
    """Scrape a product page including variant data.

    JS rendering is enabled because variant pickers (sizes, colors,
    configurations) are often populated client-side.

    NOTE(review): the schema below requests only base product fields;
    add variant-specific fields to the schema if the API exposes them —
    confirm against the API documentation.

    Returns:
        dict of schema-filtered data, or an empty dict on failure.
    """
    response = requests.post(
        "https://api.alterlab.io/api/v1/scrape",
        headers={"X-API-Key": "YOUR_API_KEY"},
        json={
            "url": url,
            "advanced": {
                "render_js": True  # Often needed for variant data
            },
            "extraction_schema": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "price": {"type": "number"},
                    "in_stock": {"type": "boolean"},
                    "images": {"type": "array"}
                }
            }
        }
    )
    # Only parse the body on success: error responses may not be JSON.
    if not response.ok:
        return {}
    return response.json().get("filtered_content", {})

# Example: Scrape a product
product = scrape_product_with_variants("https://amazon.com/dp/B0123456789")
if product:
    print(f"Product: {product.get('name')}")
    print(f"Price: ${product.get('price')}")
    print(f"In Stock: {product.get('in_stock')}")

Variant Data
content response when scraping e-commerce sites. The schema filter extracts the fields you specify from the structured data.

Batch Scraping Products
Scrape multiple products efficiently using the batch API:
import requests

def batch_scrape_products(urls, webhook_url=None):
    """Scrape multiple products in a single batch request.

    Args:
        urls: Product page URLs to include in the batch.
        webhook_url: Optional callback URL notified when the job completes.

    Returns:
        The batch job id string, or None if submission failed.
    """
    product_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "price": {"type": "number"},
            "in_stock": {"type": "boolean"}
        }
    }
    requests_list = [
        {
            "url": url,
            "extraction_schema": product_schema
        }
        for url in urls
    ]
    response = requests.post(
        "https://api.alterlab.io/api/v1/batch",
        headers={"X-API-Key": "YOUR_API_KEY"},
        json={
            "requests": requests_list,
            "webhook_url": webhook_url  # Optional: get notified when complete
        }
    )
    # Don't parse error responses -- they may not be valid JSON.
    if not response.ok:
        return None
    return response.json().get("job_id")

# Submit batch of products
product_urls = [
    "https://amazon.com/dp/B0123456789",
    "https://amazon.com/dp/B0987654321",
    # ... more URLs
]
job_id = batch_scrape_products(
    product_urls,
    webhook_url="https://myapp.com/webhooks/scrape-complete"
)
print(f"Batch job submitted: {job_id}")
# Poll for results or wait for webhook
def get_batch_results(job_id):
response = requests.get(
f"https://api.alterlab.io/api/v1/jobs/{job_id}",
headers={"X-API-Key": "YOUR_API_KEY"}
)
return response.json()Batch Limits
Handling Anti-Bot Protection
E-commerce sites often have strong anti-bot measures. Here's how AlterLab handles them:
Automatic Tier Escalation
AlterLab automatically escalates through anti-bot tiers when needed. Start with mode: "auto" and we'll use the minimum tier required.
JavaScript Rendering
Many e-commerce sites require JS. Enable with render_js: true if you see empty or blocked responses.
Cost Controls
Limit costs with max_tier to prevent expensive CAPTCHA-solving tier:
{
"url": "https://shop.example.com/product/123",
"cost_controls": {
"max_tier": "4", // Stop before CAPTCHA tier
"max_cost": 0.01
}
}

Best Practices
Do
- Use caching for product pages that don't change frequently
- Batch similar requests together
- Start with auto mode and let AlterLab optimize
- Handle missing data gracefully (not all products have all fields)
- Respect rate limits and space out requests
Don't
- Scrape too aggressively (can get your IP blocked)
- Ignore robots.txt or terms of service
- Store personal customer data
- Use JS rendering when not needed (increases costs)
- Forget to validate extracted data