Google News Scraping: Build a Custom News Aggregator

May 28, 2026 · tutorial · #python, #google-news, #web-scraping, #requests, #beautifulsoup, #json

Google News is a strong input for a personal news radar, especially if you want your own topic definitions, deduplication, and a consistent daily export.

This tutorial shows a practical approach to scraping public Google News search results and turning them into a clean feed.

Note: scraping Google properties can be fragile and may be subject to rate limits and terms. Keep volume reasonable, cache while developing, and prefer official feeds when available.

If you scale topic queries, use ProxiesAPI for stability

Google News pages can throttle or vary by region. ProxiesAPI helps keep your fetch layer consistent when you run many topic queries on a schedule.

Get 1,000 free API calls · View pricing

What we are scraping

We use the Google News search URL:

https://news.google.com/search?q=YOUR_TOPIC&hl=en-US&gl=US&ceid=US:en

The page contains multiple article cards. Inside each card you can usually find:

headline link
publisher
time

Our pipeline:

build a topic URL
fetch HTML (direct first, ProxiesAPI optional)
parse cards into structured records
dedupe and export

Setup

python3 -m venv .venv
source .venv/bin/activate
pip install requests beautifulsoup4 lxml python-dateutil

Step 1: Fetch HTML (direct and ProxiesAPI option)

import random
import time
from urllib.parse import quote_plus

import requests

# Timeout passed to every request made by fetch_html; requests interprets a
# 2-tuple as (connect timeout, read timeout) in seconds.
TIMEOUT = (10, 60)


def proxiesapi_url(target_url: str, api_key: str) -> str:
    """Build the ProxiesAPI request URL that wraps *target_url*.

    Both the API key and the target URL are percent-encoded with
    ``quote_plus`` so they can be embedded safely as query-string values.

    Args:
        target_url: The page that should be fetched through the proxy.
        api_key: The ProxiesAPI key placed in the ``key`` parameter.

    Returns:
        The full ``api.proxiesapi.com`` URL as a string.
    """
    encoded_key = quote_plus(api_key)
    encoded_target = quote_plus(target_url)
    return "http://api.proxiesapi.com/?key=" + encoded_key + "&url=" + encoded_target


def fetch_html(url: str, *, proxiesapi_key: str | None = None, max_attempts: int = 6) -> str:
    """Fetch *url* and return the response body as text, retrying on failure.

    The request is sent directly, or through ProxiesAPI when
    *proxiesapi_key* is given. Any HTTP status of 400 or above is treated as
    a failure. Between failed attempts the function sleeps for
    ``min(40, 2 ** attempt)`` seconds plus up to one second of random jitter.

    Args:
        url: The page to download.
        proxiesapi_key: Optional ProxiesAPI key; when truthy the request is
            routed through ``proxiesapi_url``.
        max_attempts: Maximum number of requests to make before giving up.

    Returns:
        The decoded response body (an empty string if the body is empty).

    Raises:
        RuntimeError: If every attempt failed. The last underlying
            ``requests`` error is attached as ``__cause__``.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
    }
    # The target does not change between attempts, so build it once.
    target = proxiesapi_url(url, proxiesapi_key) if proxiesapi_key else url

    last_err = None
    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        for attempt in range(1, max_attempts + 1):
            try:
                r = session.get(target, headers=headers, timeout=TIMEOUT)
                if r.status_code >= 400:
                    raise requests.HTTPError(f"HTTP {r.status_code}")
                return r.text or ""
            except requests.RequestException as e:
                # Only network/HTTP failures are retried; programming errors
                # propagate immediately instead of being retried.
                last_err = e
                # No point in backing off after the final attempt.
                if attempt < max_attempts:
                    time.sleep(min(40, 2 ** attempt) + random.random())

    raise RuntimeError(f"fetch failed: {last_err}") from last_err

Step 2: Build a topic search URL

from urllib.parse import urlencode


def news_search_url(topic: str, *, hl: str = "en-US", gl: str = "US", ceid: str = "US:en") -> str:
    """Return the Google News search URL for *topic*.

    Args:
        topic: Free-text search query, sent as the ``q`` parameter.
        hl: Value for the ``hl`` query parameter.
        gl: Value for the ``gl`` query parameter.
        ceid: Value for the ``ceid`` query parameter.

    Returns:
        The search URL with all four parameters URL-encoded, in the order
        ``q``, ``hl``, ``gl``, ``ceid``.
    """
    params = (("q", topic), ("hl", hl), ("gl", gl), ("ceid", ceid))
    return "https://news.google.com/search" + "?" + urlencode(params)

Step 3: Parse article cards (selectors plus normalization)

Google News markup changes, so we target article elements and then look for a headline link inside common heading tags.

import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from dateutil import parser as date_parser

# Site origin against which relative hrefs from the page are resolved.
BASE = "https://news.google.com"


def clean_ws(s: str) -> str:
    """Trim *s* and collapse every run of whitespace into a single space.

    Args:
        s: Text to normalize; ``None`` or an empty string is accepted.

    Returns:
        The normalized text, or ``""`` when *s* is falsy.
    """
    # In a raw string the whitespace class is a single backslash: r"\s+".
    # The doubled form r"\\s+" matches a literal backslash followed by "s".
    return re.sub(r"\s+", " ", (s or "").strip())


def normalize_href(href: str) -> str:
    """Turn a site-relative href into an absolute URL under ``BASE``.

    Hrefs starting with ``./`` or ``/`` are resolved against ``BASE``.
    Anything else, including an empty or ``None`` href, is returned
    unchanged.

    Args:
        href: The raw ``href`` attribute value.

    Returns:
        The absolute URL for the two relative forms, otherwise *href* as-is.
    """
    if href and href.startswith("./"):
        relative = href[2:]
        return urljoin(f"{BASE}/", relative)
    if href and href.startswith("/"):
        return urljoin(BASE, href)
    return href


def parse_news_results(html: str) -> list[dict]:
    """Extract article records from a Google News search results page.

    Every ``<article>`` element is inspected for a headline link. Cards
    without a usable title or URL are skipped.

    Args:
        html: Raw HTML of the results page.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``source`` and
        ``published_at``. ``source`` and ``published_at`` are ``None`` when
        they cannot be determined; ``published_at`` is otherwise an ISO 8601
        string.
    """
    headline_selector = "h3 a, h4 a, a[href^=\"./articles/\"], a[href*=\"/articles/\"]"
    publisher_selector = "a[data-n-tid], div[role=heading] a, span"

    records: list[dict] = []
    for card in BeautifulSoup(html, "lxml").select("article"):
        link = card.select_one(headline_selector)
        if not link:
            continue

        href = normalize_href(link.get("href"))
        title = clean_ws(link.get_text(" ", strip=True))
        if not (title and href):
            continue

        # Publisher name: first element in the card matching any fallback selector.
        publisher = None
        publisher_el = card.select_one(publisher_selector)
        if publisher_el:
            publisher = clean_ws(publisher_el.get_text(" ", strip=True)) or None

        # Prefer the machine-readable datetime attribute over the visible text.
        published_at = None
        stamp_el = card.select_one("time")
        if stamp_el:
            raw = stamp_el.get("datetime") or stamp_el.get_text(" ", strip=True)
            try:
                published_at = date_parser.parse(raw).isoformat()
            except Exception:
                # Best effort: an unparseable timestamp leaves the field empty.
                published_at = None

        records.append(
            {"title": title, "url": href, "source": publisher, "published_at": published_at}
        )

    return records

Step 4: Dedupe and export a daily feed

import json
import os
import re
from datetime import datetime, timezone


def norm_title(t: str) -> str:
    """Reduce a title to a lowercase key containing only word characters.

    Used as a dedupe key so titles differing only in case, spacing or
    punctuation compare equal.

    Args:
        t: The title; ``None`` or an empty string is accepted.

    Returns:
        The lowercased title with every non-word character removed, or
        ``""`` when *t* is falsy.
    """
    # In a raw string the non-word class is a single backslash: r"\W+".
    # The doubled form r"\\W+" matches a literal backslash followed by "W".
    return re.sub(r"\W+", "", (t or "").lower())


def dedupe(items: list[dict]) -> list[dict]:
    """Drop repeated records, keeping the first occurrence of each.

    A record is identified by its ``url``; when that is missing or empty,
    the result of ``norm_title`` on its ``title`` is used instead. Records
    for which neither yields a non-empty key are discarded.

    Args:
        items: Records to filter, in priority order.

    Returns:
        A new list with the surviving records in their original order.
    """
    seen_keys = set()
    unique = []
    for record in items:
        identity = record.get("url")
        if not identity:
            # Fall back to the title-based key only when there is no URL.
            identity = norm_title(record.get("title", ""))
        if identity and identity not in seen_keys:
            seen_keys.add(identity)
            unique.append(record)
    return unique


def main() -> None:
    """Fetch every configured topic and write the merged feed to a JSON file.

    For each topic the search page is downloaded (through ProxiesAPI when
    the ``PROXIESAPI_KEY`` environment variable is set), parsed, and each
    record is tagged with its topic. The combined records are deduplicated,
    sorted by ``published_at`` in descending order (records without a
    timestamp sort last), and written to ``news-feed-YYYY-MM-DD.json`` using
    the current UTC date.
    """
    api_key = os.environ.get("PROXIESAPI_KEY")
    topics = ["AI agent automation", "India startup funding", "cybersecurity breach"]

    collected: list[dict] = []
    for topic in topics:
        page = fetch_html(news_search_url(topic), proxiesapi_key=api_key)
        for record in parse_news_results(page):
            record["topic"] = topic
            collected.append(record)

    # ISO 8601 strings are compared as text; a missing timestamp becomes "".
    feed = sorted(
        dedupe(collected),
        key=lambda rec: rec.get("published_at") or "",
        reverse=True,
    )

    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    out_path = f"news-feed-{stamp}.json"
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(feed, fh, ensure_ascii=False, indent=2)

    print("items:", len(feed), "->", out_path)


# Run the aggregator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()