News Monitoring
Build a news aggregation and monitoring system. Track mentions, analyze sentiment, and stay on top of industry news.
Use Cases
Brand Monitoring
Track mentions of your brand, products, or competitors across news sources.
Market Intelligence
Monitor industry trends, competitor announcements, and market movements.
Content Aggregation
Build news feeds, newsletters, or content curation platforms.
Research & Analysis
Collect articles for academic research, sentiment analysis, or trend detection.
Scraping Articles
Use the article extraction profile for news content:
import requests
from datetime import datetime
def scrape_article(url, timeout=30):
    """Extract structured data from a news article via the AlterLab API.

    Args:
        url: The article URL to scrape.
        timeout: Seconds to wait for the API response. requests has no
            default timeout, so without this a stalled connection would
            block the caller indefinitely.

    Returns:
        A dict with title, author, published_date, content, summary,
        categories, the original source_url, and a scraped_at ISO
        timestamp — or None when the API reports failure.
    """
    response = requests.post(
        "https://api.alterlab.io/api/v1/scrape",
        headers={"X-API-Key": "YOUR_API_KEY"},
        json={
            "url": url,
            "extraction_profile": "article",
            "formats": ["markdown"]  # Get clean markdown content
        },
        timeout=timeout,
    )
    data = response.json()
    if data.get("success"):
        article = data["extracted"]
        return {
            "title": article.get("title"),
            "author": article.get("author"),
            "published_date": article.get("published_date"),
            "content": article.get("content"),
            "summary": article.get("summary"),
            "categories": article.get("categories", []),
            "source_url": url,
            "scraped_at": datetime.now().isoformat()
        }
    return None
# Example: scrape a single article and print its core metadata.
article = scrape_article("https://techcrunch.com/2024/01/15/example-article")
# NOTE(review): scrape_article returns None when the API reports failure —
# indexing below would raise TypeError in that case; guard in real code.
print(f"Title: {article['title']}")
print(f"Author: {article['author']}")
print(f"Published: {article['published_date']}")

RSS Feed Alternative
Many news sites no longer provide RSS feeds. Scrape their homepage or category pages to build your own feed:
import requests
def get_latest_articles(source_url, timeout=30):
    """Scrape the latest article listings from a news homepage.

    Args:
        source_url: Homepage or category-page URL to scrape.
        timeout: Seconds to wait for the API response; requests would
            otherwise wait indefinitely on a stalled connection.

    Returns:
        A list of article dicts (title, url, summary, timestamp, author,
        category, image_url) or [] when nothing was extracted.
    """
    response = requests.post(
        "https://api.alterlab.io/api/v1/scrape",
        headers={"X-API-Key": "YOUR_API_KEY"},
        json={
            "url": source_url,
            # Schema-guided extraction: one object holding an array of
            # article entries with the listed string fields.
            "extraction_schema": {
                "type": "object",
                "properties": {
                    "articles": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "title": {"type": "string"},
                                "url": {"type": "string"},
                                "summary": {"type": "string"},
                                "timestamp": {"type": "string"},
                                "author": {"type": "string"},
                                "category": {"type": "string"},
                                "image_url": {"type": "string"}
                            }
                        }
                    }
                }
            },
            "extraction_prompt": "Extract all article links from this news homepage. Include the full URL, title, summary/description, publication time, and category if visible.",
            "cache": True,
            "cache_ttl": 300  # Cache for 5 minutes
        },
        timeout=timeout,
    )
    return response.json().get("extracted", {}).get("articles", [])
# Build a multi-source feed by aggregating several homepages.
sources = [
    "https://techcrunch.com",
    "https://www.theverge.com",
    "https://arstechnica.com"
]
all_articles = []
for source in sources:
    articles = get_latest_articles(source)
    all_articles.extend(articles)
# Sort by timestamp (lexicographic string compare — assumes the API
# returns ISO-8601-style timestamps; TODO confirm the actual format).
all_articles.sort(key=lambda x: x.get('timestamp', ''), reverse=True)
print(f"Aggregated {len(all_articles)} articles from {len(sources)} sources")

Sentiment Analysis
Use natural language extraction to analyze article sentiment:
import requests
def analyze_article_sentiment(url, timeout=30):
    """Scrape an article and have the API analyze its sentiment.

    Args:
        url: The article URL to analyze.
        timeout: Seconds to wait for the API response; requests has no
            default timeout, so this prevents an indefinite hang.

    Returns:
        A dict with title, summary, sentiment (positive/negative/
        neutral/mixed), sentiment_score (-1 to 1), key_points, and
        entities_mentioned — or {} when nothing was extracted.
    """
    response = requests.post(
        "https://api.alterlab.io/api/v1/scrape",
        headers={"X-API-Key": "YOUR_API_KEY"},
        json={
            "url": url,
            "extraction_schema": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "summary": {"type": "string"},
                    "sentiment": {
                        "type": "string",
                        "enum": ["positive", "negative", "neutral", "mixed"]
                    },
                    "sentiment_score": {
                        "type": "number",
                        "description": "Score from -1 (very negative) to 1 (very positive)"
                    },
                    "key_points": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Main points or takeaways"
                    },
                    "entities_mentioned": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Companies, people, or products mentioned"
                    }
                }
            },
            "extraction_prompt": "Analyze this article. Determine the overall sentiment toward the main subject. Extract key points and notable entities mentioned."
        },
        timeout=timeout,
    )
    return response.json().get("extracted", {})
# Analyze sentiment for brand monitoring.
article = analyze_article_sentiment("https://news.example.com/article/123")
# NOTE(review): analyze_article_sentiment returns {} on failure — direct
# key access below would raise KeyError; use .get() in production code.
print(f"Title: {article['title']}")
print(f"Sentiment: {article['sentiment']} ({article['sentiment_score']})")
print(f"Key Points: {article['key_points']}")
print(f"Entities: {article['entities_mentioned']}")

Topic Extraction
Categorize and tag articles automatically:
import requests
def extract_topics(url, custom_topics=None, timeout=30):
    """Extract topics, tags, and content type from an article.

    Args:
        url: The article URL to categorize.
        custom_topics: Optional list of preferred category names the
            extraction should prefer when applicable.
        timeout: Seconds to wait for the API response; requests would
            otherwise wait indefinitely on a stalled connection.

    Returns:
        A dict with title, primary_category, subcategories, tags,
        reading_time_minutes, and content_type — or {} on failure.
    """
    prompt = "Categorize this article and extract relevant topics/tags."
    if custom_topics:
        # Steer the model toward the caller's taxonomy when one is given.
        prompt += f" Use these categories when applicable: {', '.join(custom_topics)}"
    response = requests.post(
        "https://api.alterlab.io/api/v1/scrape",
        headers={"X-API-Key": "YOUR_API_KEY"},
        json={
            "url": url,
            "extraction_schema": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "primary_category": {"type": "string"},
                    "subcategories": {
                        "type": "array",
                        "items": {"type": "string"}
                    },
                    "tags": {
                        "type": "array",
                        "items": {"type": "string"}
                    },
                    "reading_time_minutes": {"type": "integer"},
                    "content_type": {
                        "type": "string",
                        "enum": ["news", "opinion", "analysis", "tutorial", "review", "interview"]
                    }
                }
            },
            "extraction_prompt": prompt
        },
        timeout=timeout,
    )
    return response.json().get("extracted", {})
# Use predefined categories to keep tagging consistent across articles.
my_categories = ["AI/ML", "Startups", "Fintech", "Crypto", "Enterprise", "Consumer Tech"]
article = extract_topics(
    "https://techcrunch.com/article/123",
    custom_topics=my_categories
)
# NOTE(review): extract_topics returns {} on failure — direct key access
# below would raise KeyError in that case.
print(f"Category: {article['primary_category']}")
print(f"Tags: {', '.join(article['tags'])}")
print(f"Type: {article['content_type']}")

Building a Monitoring System
Combine scraping with webhooks and scheduling for automated monitoring:
import requests
from datetime import datetime, timedelta
class NewsMonitor:
    """Polls news sources and surfaces new articles matching keywords.

    Note: seen_articles grows without bound; for a long-running
    deployment, persist and prune it periodically.
    """

    def __init__(self, api_key, keywords):
        self.api_key = api_key        # AlterLab API key
        self.keywords = keywords      # terms matched case-insensitively
        self.sources = []             # list of {"url": ..., "name": ...}
        self.seen_articles = set()    # ids (url or title) already processed

    def add_source(self, url, name):
        """Add a news source to monitor."""
        self.sources.append({"url": url, "name": name})

    def check_sources(self):
        """Check all sources for new articles mentioning keywords.

        Returns:
            A list of article dicts, each annotated with "source" (the
            source's display name) and "matching_keywords". Articles
            already seen in a previous check are skipped.
        """
        new_articles = []
        for source in self.sources:
            articles = self._scrape_source(source["url"])
            for article in articles:
                # Dedupe by URL, falling back to title. Using `or` (not a
                # dict-get default) also covers a present-but-empty "url",
                # and skipping falsy ids stops one None id from shadowing
                # every later id-less article.
                article_id = article.get("url") or article.get("title")
                if not article_id or article_id in self.seen_articles:
                    continue
                self.seen_articles.add(article_id)
                # Keyword match against title + summary, case-insensitive.
                text = f"{article.get('title', '')} {article.get('summary', '')}".lower()
                matching_keywords = [kw for kw in self.keywords if kw.lower() in text]
                if matching_keywords:
                    article["source"] = source["name"]
                    article["matching_keywords"] = matching_keywords
                    new_articles.append(article)
        return new_articles

    def _scrape_source(self, url):
        """Scrape article listings from a source homepage.

        Returns [] when the response contains no extracted articles.
        """
        response = requests.post(
            "https://api.alterlab.io/api/v1/scrape",
            headers={"X-API-Key": self.api_key},
            json={
                "url": url,
                "extraction_schema": {
                    "type": "object",
                    "properties": {
                        "articles": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "title": {"type": "string"},
                                    "url": {"type": "string"},
                                    "summary": {"type": "string"},
                                    "timestamp": {"type": "string"}
                                }
                            }
                        }
                    }
                },
                "cache": True,
                "cache_ttl": 300
            },
            # requests has no default timeout; prevent indefinite hangs.
            timeout=30,
        )
        data = response.json()
        return data.get("extracted", {}).get("articles", [])
# Usage: configure keywords and sources, then poll for new mentions.
monitor = NewsMonitor(
    api_key="YOUR_API_KEY",
    keywords=["AlterLab", "web scraping", "data extraction"]
)
monitor.add_source("https://techcrunch.com", "TechCrunch")
monitor.add_source("https://www.wired.com", "Wired")
monitor.add_source("https://venturebeat.com", "VentureBeat")
# Run periodically (e.g., every 15 minutes) — dedup via seen_articles
# keeps repeated runs from re-reporting the same article.
new_mentions = monitor.check_sources()
for article in new_mentions:
    print(f"New mention: {article['title']}")
    print(f"  Source: {article['source']}")
    print(f"  Keywords: {article['matching_keywords']}")
print(f"  URL: {article['url']}")

Automation
Handling Paywalls
Many news sites have paywalls. Here's how to work with them:
Metered Paywalls
Sites that allow a few free articles. AlterLab's rotating sessions help avoid hitting limits, but respect the site's terms.
Soft Paywalls
Content visible in HTML but hidden by JavaScript. Use render_js: false to get raw HTML before paywall triggers.
Hard Paywalls
Content not available without subscription. You'll only get preview/summary content. Consider using the site's official API if available.
Legal Notice
Best Practices
1. Use Caching Wisely
Cache homepage/category scrapes for 5-15 minutes to reduce costs. Don't cache individual article scrapes if you need real-time content.
2. Deduplicate Articles
Track seen articles by URL to avoid processing duplicates. News sites often keep articles on the homepage for hours.
3. Handle Rate Limits
Space out requests to each source. Don't hammer a single site with hundreds of requests per minute.
4. Store Raw Content
Save the raw scraped content along with extracted data. This lets you re-process articles later with updated extraction logic.
5. Respect Attribution
If you display aggregated content, always link back to the original source and credit the author.