Scrape BBC News Topic Pages and Headlines with Python
BBC News topic pages are a nice middle ground between a homepage scrape and a full article crawler.
They already group stories around one entity or theme, and the page usually exposes exactly the fields most monitoring pipelines need:
- headline text
- canonical article URL
- relative freshness like "10 mins ago"
- the topic name you crawled
In this tutorial we will scrape a live BBC topic page, extract the cards, and export a clean JSON dataset you can feed into alerts, summaries, or downstream article crawlers.

BBC topic pages are lightweight, but frequent news crawls still fail at the network layer first. ProxiesAPI helps when you need stable fetches across many sections, topics, or repeated runs.
What we're scraping
BBC topic pages live under URLs like:
https://www.bbc.com/news/topics/cx1m7zg01xyt
https://www.bbc.com/news/topics/ce1qrvleleqt
For this guide, we'll use the United States topic page:
https://www.bbc.com/news/topics/cx1m7zg01xyt
When you inspect the HTML, the useful patterns are:
- article cards like div[data-testid="liverpool-card"]
- the main link on each card as a[data-testid="internal-link"]
- the freshness label as span[data-testid="card-metadata-lastupdated"]
That matters because BBC uses generated class names. The data-testid hooks are much less brittle than copying a random CSS class from DevTools.
Setup
python3 -m venv .venv
source .venv/bin/activate
pip install requests beautifulsoup4 lxml
We'll use:
- requests for HTTP
- BeautifulSoup for parsing
- json from the standard library for export
Step 1: Fetch the topic page
Start with a regular browser-like request and real timeouts.
import requests
# Topic page to crawl, and the timeout pair handed to every request below.
TOPIC_URL = "https://www.bbc.com/news/topics/cx1m7zg01xyt"
TIMEOUT = (10, 30)

# One shared session so the browser-like headers are sent on every request.
session = requests.Session()
session.headers["User-Agent"] = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/126.0.0.0 Safari/537.36"
)
session.headers["Accept-Language"] = "en-GB,en;q=0.9"
def fetch_html(url: str) -> str:
    """Download *url* with the shared session and return the response body as text.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = session.get(url, timeout=TIMEOUT)
    # Fail loudly on error responses instead of parsing them as a topic page.
    response.raise_for_status()
    return response.text


html = fetch_html(TOPIC_URL)
print("downloaded", len(html), "chars")
Quick sanity check from the terminal:
curl -L -s "https://www.bbc.com/news/topics/cx1m7zg01xyt" | head -n 5
If the response includes <title>United States - BBC News</title>, you're on the right page.
Step 2: Parse the topic cards
The card layout can include standard articles, live pages, and videos. For a headline dataset, it is usually better to:
- keep article and live links
- skip pure navigation links
- normalize relative URLs to absolute URLs
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Site origin against which relative card hrefs are resolved into absolute URLs.
BBC_BASE = "https://www.bbc.com"
def clean(text: str | None) -> str | None:
if not text:
return None
value = " ".join(text.split())
return value or None
def is_story_path(href: str) -> bool:
if not href:
return False
return href.startswith("/news/articles/") or href.startswith("/news/live/")
def _card_text(card, selector: str) -> str | None:
    """Return the cleaned text of the first node in *card* matching *selector*, or None."""
    node = card.select_one(selector)
    return clean(node.get_text(" ", strip=True)) if node else None


def parse_topic_cards(html: str, topic_name: str) -> list[dict]:
    """Extract one row per unique story card from a topic page.

    Args:
        html: Raw HTML of the topic page.
        topic_name: Label stored in every row's ``topic`` field.

    Returns:
        A list of dicts with the keys ``topic``, ``headline``, ``url``,
        ``updated`` and ``summary``, in page order. ``updated`` and
        ``summary`` are None when the card has no matching node.
    """
    soup = BeautifulSoup(html, "lxml")
    rows: list[dict] = []
    seen = set()

    for card in soup.select('div[data-testid="liverpool-card"]'):
        link = card.select_one('a[data-testid="internal-link"][href]')
        if not link:
            continue

        href = link.get("href", "")
        if not is_story_path(href):
            continue

        # Deduplicate on the absolute URL.
        url = urljoin(BBC_BASE, href)
        if url in seen:
            continue

        # A card without a headline is skipped entirely.
        title = _card_text(card, "h2")
        if not title:
            continue

        rows.append(
            {
                "topic": topic_name,
                "headline": title,
                "url": url,
                "updated": _card_text(card, '[data-testid="card-metadata-lastupdated"]'),
                "summary": _card_text(card, "p"),
            }
        )
        seen.add(url)

    return rows
Why data-testid is the right anchor here
If you inspect BBC's HTML today, you'll see class names like:
IndexCard-styles__IndexCardStyled-sc-...
Liverpool-styles__CardStyled-sc-...
Those are implementation details, not contracts. The data-testid attributes are not perfect forever either, but they tend to survive styling refactors much better.
Step 3: Export the results to JSON
import json
from datetime import datetime, timezone
def export_json(rows: list[dict], path: str) -> None:
    """Write *rows* to *path* as a UTF-8 JSON document.

    The document is an object with three keys: ``scraped_at`` (current UTC
    time in ISO 8601 form), ``count`` (number of rows) and ``items`` (the
    rows themselves). An existing file at *path* is overwritten.
    """
    payload = {
        "scraped_at": datetime.now(timezone.utc).isoformat(),
        "count": len(rows),
        "items": rows,
    }
    with open(path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII headline characters readable
        # in the file instead of \uXXXX escapes.
        json.dump(payload, f, ensure_ascii=False, indent=2)
if __name__ == "__main__":
    # Fetch -> parse -> export, then echo a small sample for a quick visual check.
    html = fetch_html(TOPIC_URL)
    rows = parse_topic_cards(html, topic_name="United States")
    export_json(rows, "bbc_topic_headlines.json")
    print("saved", len(rows), "rows")
    print(rows[:3])
Example entry from the "items" array of the exported file:
{
"topic": "United States",
"headline": "Trump faces pressure over ceasefire claims",
"url": "https://www.bbc.com/news/articles/...",
"updated": "10 mins ago",
"summary": "..."
}
Full script
import json
import requests
from datetime import datetime, timezone
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Topic page to crawl, the origin used to absolutize card links, and the
# timeout pair handed to every request below.
TOPIC_URL = "https://www.bbc.com/news/topics/cx1m7zg01xyt"
BBC_BASE = "https://www.bbc.com"
TIMEOUT = (10, 30)

# One shared session so the browser-like headers are sent on every request.
session = requests.Session()
session.headers["User-Agent"] = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/126.0.0.0 Safari/537.36"
)
session.headers["Accept-Language"] = "en-GB,en;q=0.9"
def clean(text):
if not text:
return None
value = " ".join(text.split())
return value or None
def fetch_html(url: str) -> str:
    """Download *url* with the shared session and return the response body as text.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = session.get(url, timeout=TIMEOUT)
    # Fail loudly on error responses instead of parsing them as a topic page.
    response.raise_for_status()
    return response.text
def is_story_path(href):
return href.startswith("/news/articles/") or href.startswith("/news/live/")
def _card_text(card, selector: str) -> str | None:
    """Return the cleaned text of the first node in *card* matching *selector*, or None."""
    node = card.select_one(selector)
    return clean(node.get_text(" ", strip=True)) if node else None


def parse_topic_cards(html: str, topic_name: str) -> list[dict]:
    """Extract one row per unique story card from a topic page.

    Args:
        html: Raw HTML of the topic page.
        topic_name: Label stored in every row's ``topic`` field.

    Returns:
        A list of dicts with the keys ``topic``, ``headline``, ``url``,
        ``updated`` and ``summary``, in page order. ``updated`` and
        ``summary`` are None when the card has no matching node.
    """
    soup = BeautifulSoup(html, "lxml")
    rows: list[dict] = []
    seen = set()

    for card in soup.select('div[data-testid="liverpool-card"]'):
        link = card.select_one('a[data-testid="internal-link"][href]')
        if not link:
            continue

        href = link.get("href", "")
        if not is_story_path(href):
            continue

        # Deduplicate on the absolute URL.
        url = urljoin(BBC_BASE, href)
        if url in seen:
            continue

        # A card without a headline is skipped entirely.
        title = _card_text(card, "h2")
        if not title:
            continue

        rows.append(
            {
                "topic": topic_name,
                "headline": title,
                "url": url,
                "updated": _card_text(card, '[data-testid="card-metadata-lastupdated"]'),
                "summary": _card_text(card, "p"),
            }
        )
        seen.add(url)

    return rows
def export_json(rows: list[dict], path: str) -> None:
    """Write *rows* to *path* as a UTF-8 JSON document.

    The document is an object with three keys: ``scraped_at`` (current UTC
    time in ISO 8601 form), ``count`` (number of rows) and ``items`` (the
    rows themselves). An existing file at *path* is overwritten.
    """
    payload = {
        "scraped_at": datetime.now(timezone.utc).isoformat(),
        "count": len(rows),
        "items": rows,
    }
    with open(path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII headline characters readable
        # in the file instead of \uXXXX escapes.
        json.dump(payload, f, ensure_ascii=False, indent=2)
def main() -> None:
    """Fetch the topic page, parse its story cards, and export them to JSON."""
    html = fetch_html(TOPIC_URL)
    rows = parse_topic_cards(html, topic_name="United States")
    export_json(rows, "bbc_topic_headlines.json")
    print(f"saved {len(rows)} rows")


if __name__ == "__main__":
    main()
Practical upgrades
Once the base scraper works, the next useful improvements are:
- crawl multiple topic pages from a seed list
- store a stable article ID extracted from the path
- enrich each article by fetching the article page separately
- deduplicate by URL so repeat runs only alert on new items
If you want to scale beyond one page, keep the topic crawl and the article crawl as separate jobs. Topic pages tell you what is new. Article pages tell you what is inside.
Where ProxiesAPI fits
One BBC topic page usually works fine without a proxy.
But that is not the real production use case. Teams usually want:
- dozens of topics
- repeated crawls every few minutes
- multiple geographies or environments
That is where ProxiesAPI becomes useful. It does not change your parsing logic. It helps keep your fetch layer stable when request volume, retries, and IP reputation become the failure point.
The honest rule is simple:
- for one-off testing, scrape directly
- for recurring crawls at scale, stabilize the network layer first
That's the point where a proxy-backed fetch pipeline starts paying for itself.
BBC topic pages are lightweight, but frequent news crawls still fail at the network layer first. ProxiesAPI helps when you need stable fetches across many sections, topics, or repeated runs.