Google News Scraping: Build a Custom News Aggregator
Google News is a strong input for a personal news radar, especially if you want your own topic definitions, deduplication, and a consistent daily export.
This tutorial shows a practical approach to scraping public Google News search results and turning them into a clean feed.
Note: scraping Google properties can be fragile and may be subject to rate limits and Google's terms of service. Keep volume reasonable, cache while developing, and prefer official feeds when available.
Google News pages can throttle or vary by region. ProxiesAPI helps keep your fetch layer consistent when you run many topic queries on a schedule.
What we are scraping
We use the Google News search URL:
https://news.google.com/search?q=YOUR_TOPIC&hl=en-US&gl=US&ceid=US:en
The page contains multiple article cards. Inside each card you can usually find:
- headline link
- publisher
- time
Our pipeline:
- build a topic URL
- fetch HTML (directly by default, or through ProxiesAPI when an API key is set)
- parse cards into structured records
- dedupe and export
Setup
python3 -m venv .venv
source .venv/bin/activate
pip install requests beautifulsoup4 lxml python-dateutil
Step 1: Fetch HTML (direct and ProxiesAPI option)
import random
import time
from urllib.parse import quote_plus
import requests
# Passed as the `timeout` argument of `session.get` in fetch_html; requests
# reads a 2-tuple as (connect timeout, read timeout) in seconds.
TIMEOUT = (10, 60)
def proxiesapi_url(target_url: str, api_key: str) -> str:
    """Build the ProxiesAPI request URL that wraps *target_url*.

    Both the API key and the target URL are encoded with ``quote_plus`` so
    they can be embedded safely as query-string values.
    """
    encoded_key = quote_plus(api_key)
    encoded_target = quote_plus(target_url)
    return f"http://api.proxiesapi.com/?key={encoded_key}&url={encoded_target}"
def fetch_html(url: str, *, proxiesapi_key: str | None = None, max_attempts: int = 6) -> str:
    """Download *url* and return the response body as text, retrying on failure.

    Args:
        url: Page to fetch.
        proxiesapi_key: When truthy, the request goes to the URL built by
            ``proxiesapi_url``; otherwise ``url`` is requested directly.
        max_attempts: Maximum number of GET attempts before giving up.

    Returns:
        The response text, or ``""`` when the body is empty.

    Raises:
        RuntimeError: If no attempt succeeded (an exception was raised or the
            HTTP status was >= 400). The last error is attached as the cause.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
    }
    last_err = None
    # The context manager closes the session's pooled connections on every
    # exit path (success, exhaustion, or an unexpected error).
    with requests.Session() as session:
        for attempt in range(1, max_attempts + 1):
            try:
                target = proxiesapi_url(url, proxiesapi_key) if proxiesapi_key else url
                r = session.get(target, headers=headers, timeout=TIMEOUT)
                if r.status_code >= 400:
                    raise requests.HTTPError(f"HTTP {r.status_code}")
                return r.text or ""
            except Exception as e:
                # Deliberately broad: any failure counts as a failed attempt so
                # callers only ever have to handle RuntimeError.
                last_err = e
                # Back off exponentially (capped at 40s) with jitter, but do
                # not waste time sleeping once there are no attempts left.
                if attempt < max_attempts:
                    time.sleep(min(40, 2 ** attempt) + random.random())
    raise RuntimeError(f"fetch failed: {last_err}") from last_err
Step 2: Build a topic search URL
from urllib.parse import urlencode
def news_search_url(topic: str, *, hl: str = "en-US", gl: str = "US", ceid: str = "US:en") -> str:
    """Return the Google News search URL for *topic*.

    ``hl``, ``gl`` and ``ceid`` are forwarded as query parameters of the same
    name; all values are URL-encoded by ``urlencode``.
    """
    params = {"q": topic, "hl": hl, "gl": gl, "ceid": ceid}
    return "https://news.google.com/search" + "?" + urlencode(params)
Step 3: Parse article cards (selectors plus normalization)
Google News markup changes, so we target article elements and then look for a headline link inside common heading tags.
import re
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from dateutil import parser as date_parser
# Origin that normalize_href resolves relative Google News hrefs against.
BASE = "https://news.google.com"
def clean_ws(s: str) -> str:
    """Trim *s* and collapse every run of whitespace into a single space.

    ``None`` (or any other falsy value) is treated as the empty string.
    """
    # The pattern must be r"\s+"; r"\\s+" matches a literal backslash followed
    # by "s" and would leave real whitespace untouched.
    return re.sub(r"\s+", " ", (s or "").strip())
def normalize_href(href: str) -> str:
    """Resolve a card href against ``BASE`` when it is site-relative.

    Falsy input (``None`` or ``""``) is handed back unchanged. Hrefs that
    start with ``./`` or ``/`` are joined onto ``BASE``; every other value is
    passed through as-is.
    """
    if href and href.startswith("./"):
        # Drop the "./" prefix and join onto the site root.
        return urljoin(BASE + "/", href[2:])
    if href and href.startswith("/"):
        return urljoin(BASE, href)
    return href
def parse_news_results(html: str) -> list[dict]:
    """Extract one record per ``<article>`` card found in *html*.

    Each record is a dict with the keys ``title``, ``url``, ``source`` and
    ``published_at``. Cards without a usable headline link (missing title or
    href) are skipped. ``source`` is ``None`` when no source text is found,
    and ``published_at`` is an ISO-8601 string or ``None`` when the card has
    no ``<time>`` element or its value cannot be parsed.
    """
    records: list[dict] = []
    for card in BeautifulSoup(html, "lxml").select("article"):
        link = card.select_one('h3 a, h4 a, a[href^="./articles/"], a[href*="/articles/"]')
        if not link:
            continue

        url = normalize_href(link.get("href"))
        headline = clean_ws(link.get_text(" ", strip=True))
        if not (headline and url):
            continue

        publisher_el = card.select_one("a[data-n-tid], div[role=heading] a, span")
        publisher_text = publisher_el.get_text(" ", strip=True) if publisher_el else ""
        publisher = clean_ws(publisher_text) or None

        timestamp = None
        stamp_el = card.select_one("time")
        if stamp_el:
            # Prefer the machine-readable attribute, fall back to visible text.
            raw_stamp = stamp_el.get("datetime") or stamp_el.get_text(" ", strip=True)
            try:
                timestamp = date_parser.parse(raw_stamp).isoformat()
            except Exception:
                # Unparseable timestamps are recorded as missing, not fatal.
                timestamp = None

        records.append(
            {"title": headline, "url": url, "source": publisher, "published_at": timestamp}
        )
    return records
Step 4: Dedupe and export a daily feed
import json
import os
import re
from datetime import datetime, timezone
def norm_title(t: str) -> str:
    """Return a comparison key for *t*: lower-cased with non-word characters removed.

    ``None`` (or any other falsy value) yields the empty string.
    """
    # The pattern must be r"\W+"; r"\\W+" matches a literal backslash followed
    # by "W" and would leave punctuation and spaces in the key.
    return re.sub(r"\W+", "", (t or "").lower())
def dedupe(items: list[dict]) -> list[dict]:
    """Return *items* with duplicates dropped, keeping first occurrences in order.

    An item's identity is its ``url`` when that is truthy, otherwise the
    normalized form of its ``title``. Items with no usable key are discarded.
    """
    unique = []
    seen_keys = set()
    for item in items:
        url = item.get("url")
        key = url if url else norm_title(item.get("title", ""))
        if key and key not in seen_keys:
            seen_keys.add(key)
            unique.append(item)
    return unique
def main() -> None:
    """Fetch every topic, merge the results, and write today's JSON feed.

    The ProxiesAPI key is read from the ``PROXIESAPI_KEY`` environment
    variable and passed to ``fetch_html``. Each parsed item is tagged with
    the topic it was found under, the combined list is de-duplicated and
    sorted newest-first by ``published_at`` (items without a timestamp sort
    last), then written to ``news-feed-<UTC date>.json`` in the current
    directory.
    """
    topics = ["AI agent automation", "India startup funding", "cybersecurity breach"]
    api_key = os.environ.get("PROXIESAPI_KEY")

    collected: list[dict] = []
    for topic in topics:
        page = fetch_html(news_search_url(topic), proxiesapi_key=api_key)
        for record in parse_news_results(page):
            record["topic"] = topic
            collected.append(record)

    feed = sorted(
        dedupe(collected),
        key=lambda rec: rec.get("published_at") or "",
        reverse=True,
    )

    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    path = f"news-feed-{stamp}.json"
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(feed, fh, ensure_ascii=False, indent=2)
    print("items:", len(feed), "->", path)


if __name__ == "__main__":
    main()
Practical advice
- Cache HTML while developing and keep concurrency low.
- Extract a stable key (URL is best) for deduplication.
- If you need canonical source URLs, resolve redirects later as a separate step.
Google News pages can throttle or vary by region. ProxiesAPI helps keep your fetch layer consistent when you run many topic queries on a schedule.