How to Scrape Google Finance Data with Python (Quotes, News, and Historical Prices)
If you’ve ever tried to “just scrape” financial data, you already know the trap:
- the first 20 requests work
- then you hit rate limits, weird interstitials, or incomplete pages
Google Finance is a great source for quick snapshots:
- current price and daily change
- key metrics (market cap, P/E, etc.)
- recent news cards
- a chart UI that implies historical data
In this guide you’ll build a practical scraper that:
- Fetches a Google Finance quote page.
- Extracts price + change and a few key fields.
- Pulls news headlines.
- Exports to CSV.
We’ll do it with plain Python (requests + BeautifulSoup) and write the parser in a way that’s easy to maintain.
Google properties can be sensitive to request bursts and IP reputation. ProxiesAPI helps stabilize your pipeline by rotating IPs and reducing block rates when you scrape many tickers across markets.
What we’re scraping (example)
Google Finance quote pages look like:
https://www.google.com/finance/quote/GOOG:NASDAQ or https://www.google.com/finance/quote/RELIANCE:NSE
The page is server-rendered enough that you can parse meaningful HTML without a browser.
Setup
python -m venv .venv
source .venv/bin/activate
pip install requests beautifulsoup4 lxml pandas
Step 1: Fetch with timeouts + retry/backoff
For Google properties, the “boring” stuff matters most:
- timeouts (so you don’t hang)
- retry/backoff (so you don’t spam)
- a consistent User-Agent
import random
import time
from typing import Optional
import requests
# (connect timeout, read timeout) in seconds, passed to every request so a
# stalled connection can never hang the scraper indefinitely.
TIMEOUT = (10, 30)

# One shared Session so TCP connections are reused and headers are set once.
session = requests.Session()
session.headers.update({
    # Self-identifying UA with a contact URL — a consistent, honest UA tends
    # to behave better than rotating fake browser strings.
    "User-Agent": (
        "Mozilla/5.0 (compatible; ProxiesAPI-GuidesBot/1.0; +https://proxiesapi.com)"
    ),
    # Ask for English pages so the parsing heuristics below see stable labels.
    "Accept-Language": "en-US,en;q=0.9",
})
def fetch(url: str, *, max_retries: int = 5) -> str:
    """Fetch *url* and return the response body as text.

    Transient failures — HTTP 429/5xx and network-level errors (timeouts,
    connection resets) — are retried with exponential backoff plus jitter.
    Non-transient HTTP errors (e.g. 404) raise immediately instead of
    burning retries on a request that can never succeed.

    Args:
        url: Absolute URL to fetch.
        max_retries: Total attempts before giving up.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: For non-retryable error statuses (e.g. 404).
        RuntimeError: When all retries are exhausted (chained to the last
            network exception, if any).
    """
    last_exc: Optional[Exception] = None
    for attempt in range(1, max_retries + 1):
        try:
            r = session.get(url, timeout=TIMEOUT)
            # 504 (gateway timeout) is as transient as 502/503 — retry it too.
            if r.status_code in (429, 500, 502, 503, 504):
                sleep = min(30, (2 ** attempt)) + random.random()
                print(f"HTTP {r.status_code} — backoff {sleep:.1f}s")
                time.sleep(sleep)
                continue
            r.raise_for_status()
            # Polite jitter so back-to-back calls don't hammer the host.
            time.sleep(0.8 + random.random())
            return r.text
        except requests.HTTPError:
            # Raised by raise_for_status() for statuses we deliberately did
            # not treat as transient above — fail fast, don't retry.
            raise
        except requests.RequestException as e:
            # Connection errors / timeouts are worth retrying with backoff.
            last_exc = e
            sleep = min(30, (2 ** attempt)) + random.random()
            print(f"error {type(e).__name__} — backoff {sleep:.1f}s")
            time.sleep(sleep)
    raise RuntimeError("failed to fetch") from last_exc
Step 2: Parse price + daily change
Google Finance markup changes, but the quote page usually contains a top “hero” region with:
- current price
- daily change (absolute and percent)
We’ll parse conservatively:
- look for a numeric price candidate near the top
- fall back to scanning a limited portion of the DOM
import re
from bs4 import BeautifulSoup
def clean_text(s: str | None) -> str | None:
if not s:
return None
s = re.sub(r"\s+", " ", s).strip()
return s or None
def parse_float(text: str | None) -> float | None:
if not text:
return None
m = re.search(r"([-+]?[0-9][0-9,]*\.?[0-9]*)", text)
if not m:
return None
try:
return float(m.group(1).replace(",", ""))
except ValueError:
return None
def parse_quote_page(html: str) -> dict:
    """Extract title, price, and daily percent change from a quote page.

    Deliberately heuristic: Google Finance markup shifts, so we try a
    structured selector first and fall back to scanning flattened text.

    Returns:
        dict with keys "title", "price", "change_pct" (each may be None).
    """
    soup = BeautifulSoup(html, "lxml")

    # Title usually looks like "Alphabet Inc. Class C (GOOG) Stock Price...".
    page_title = None
    if soup.title is not None:
        page_title = clean_text(soup.title.get_text(" ", strip=True))

    flat_text = soup.get_text("\n", strip=True)

    # Structured attempt: aria-label attributes tend to outlive class names.
    price = None
    hero = soup.select_one("[aria-label*='Price']")
    if hero is not None:
        price = parse_float(hero.get_text(" ", strip=True))

    # Fallback: first line in the flattened text that looks like a lone number.
    if price is None:
        padded = "\n" + flat_text + "\n"
        hit = re.search(r"\n([0-9][0-9,]*\.?[0-9]{0,4})\n", padded)
        if hit is not None:
            price = parse_float(hit.group(1))

    # Daily change: first percent-shaped token anywhere in the page text.
    change_pct = None
    pct_hit = re.search(r"([-+]?\d+\.?\d*)%", flat_text)
    if pct_hit is not None:
        change_pct = parse_float(pct_hit.group(1))

    return {
        "title": page_title,
        "price": price,
        "change_pct": change_pct,
    }
This won’t be perfect across every locale/market, but it’s a solid starting point.
Step 3: Parse key-value “stats” rows
Google Finance quote pages often include rows like:
- Market cap
- P/E ratio
- Dividend yield
These tend to appear as key/value pairs in repeated blocks.
We’ll extract any blocks that look like “Label → Value”.
def parse_stats(html: str) -> dict:
    """Collect short label/value pairs (Market cap, P/E, etc.) from the page.

    Scans every <div> whose text is short and splits into exactly two
    non-empty lines, treating them as "Label -> Value". Labels that belong
    to the price hero ("price", "open", ...) are skipped as noise.
    """
    soup = BeautifulSoup(html, "lxml")
    stats = {}
    hero_labels = ("price", "open", "high", "low")
    for candidate in soup.select("div"):
        raw = candidate.get_text("\n", strip=True)
        # Cheap pre-filter: real stat rows are short strings.
        if not raw or len(raw) > 80:
            continue
        pieces = [piece.strip() for piece in raw.split("\n") if piece.strip()]
        if len(pieces) != 2:
            continue
        label, value = pieces
        if not (2 <= len(label) <= 30 and len(value) <= 40):
            continue
        # Avoid obvious noise from the price hero region.
        if label.lower() in hero_labels:
            continue
        stats[label] = value
    return stats
This is intentionally generic. If you want the best possible extractor, lock onto the exact DOM structure you observe in the live page and target stable attributes.
Step 4: Parse news headlines
Quote pages include “News” cards. We’ll grab headline + publisher + link where possible.
from urllib.parse import urljoin
BASE = "https://www.google.com"
def parse_news(html: str, limit: int = 10) -> list[dict]:
    """Pull up to *limit* news-looking links from the quote page.

    A link counts as "news" when its visible text is headline-length
    (>= 20 chars) and its href points at /finance or a Google /url?
    redirect. Returns a list of {"headline", "url"} dicts.
    """
    soup = BeautifulSoup(html, "lxml")
    items: list[dict] = []
    for anchor in soup.select("a[href]"):
        if len(items) >= limit:
            break
        href = anchor.get("href", "")
        headline = clean_text(anchor.get_text(" ", strip=True))
        # Too-short link text is navigation chrome, not a headline.
        if not headline or len(headline) < 20:
            continue
        # News links typically route through /finance/... or /url?q=...
        if "/finance" not in href and "/url?" not in href:
            continue
        items.append({"headline": headline, "url": urljoin(BASE, href)})
    return items
Step 5: Add a simple “historical” series (pragmatic)
Google Finance charts can be tricky to reverse engineer reliably.
A practical approach for many projects is:
- scrape current snapshot from Google Finance
- use a dedicated market data provider for full historical candles
But if you just need “some recent points” for lightweight analytics, you can:
- scrape the page and look for embedded JSON-like sequences
Here’s a conservative helper that tries to find a list of numbers that looks like a series. It’s not guaranteed (because the page can change), but it’s a starting point.
def try_extract_series(html: str, max_points: int = 200) -> list[float]:
    """Best-effort hunt for an embedded numeric series in raw HTML.

    Looks for a JSON-ish array literal containing at least 21 comma-separated
    numbers (a shape typical of chart payloads) and returns up to
    *max_points* of them as floats. Returns [] when nothing qualifies —
    the page layout offers no guarantees here.
    """
    array_hit = re.search(
        r"(\[(?:-?\d+(?:\.\d+)?,){20,}-?\d+(?:\.\d+)?\])", html
    )
    if array_hit is None:
        return []
    tokens = re.findall(r"-?\d+(?:\.\d+)?", array_hit.group(1))
    series: list[float] = []
    for token in tokens[:max_points]:
        try:
            series.append(float(token))
        except ValueError:
            # Defensive: the token regex should only yield valid floats.
            continue
    return series
Again: useful for experimentation, not for mission-critical trading.
Full example: scrape a ticker and export CSV
import json
import pandas as pd
def scrape_google_finance(ticker: str) -> dict:
    """Fetch one ticker's quote page and parse it into a snapshot dict.

    Args:
        ticker: Google Finance ticker in "SYMBOL:EXCHANGE" form,
            e.g. "GOOG:NASDAQ".

    Returns:
        dict with "ticker", "url", "quote", "stats", "news", "series" keys.
    """
    url = f"https://www.google.com/finance/quote/{ticker}"
    # One fetch, many parsers — each parser re-reads the same HTML string.
    page = fetch(url)
    snapshot = {
        "ticker": ticker,
        "url": url,
        "quote": parse_quote_page(page),
        "stats": parse_stats(page),
        "news": parse_news(page),
        "series": try_extract_series(page),
    }
    return snapshot
def main():
    """Scrape one ticker, dump the full JSON, and write a one-row CSV summary."""
    data = scrape_google_finance("GOOG:NASDAQ")

    # Full payload as JSON for later inspection.
    with open("google_finance_GOOG.json", "w", encoding="utf-8") as fh:
        json.dump(data, fh, ensure_ascii=False, indent=2)

    # One flat row for spreadsheet-style analysis.
    quote = data["quote"]
    row = {
        "ticker": data["ticker"],
        "price": quote.get("price"),
        "change_pct": quote.get("change_pct"),
    }
    # Cap the stats columns at 10 so the CSV stays narrow.
    for position, (key, value) in enumerate(data["stats"].items()):
        if position >= 10:
            break
        row[f"stat_{key}"] = value

    pd.DataFrame([row]).to_csv("google_finance_snapshot.csv", index=False)
    print("done")
# Script entry point: run the scraper only when executed directly, not on import.
if __name__ == "__main__":
    main()
Block avoidance: what actually works
- Bound your request rate. Even 1 req/sec can be too much at scale.
- Cache responses. Don’t re-fetch the same ticker every minute.
- Retry with backoff (not tight loops).
- Rotate IPs when you scrape many tickers.
ProxiesAPI fits as the network layer when your scale grows from “a few tickers” to “many markets + many runs”.
FAQ
Is scraping Google Finance reliable for production trading?
No. For trading systems, use licensed market data vendors.
Can I get full OHLC candles from Google Finance?
Sometimes you can reverse engineer the chart payload, but it changes. Treat it as unstable.
Why not use Playwright?
You can — but the HTML here is often parseable without a browser, and that’s cheaper and simpler.
Google properties can be sensitive to request bursts and IP reputation. ProxiesAPI helps stabilize your pipeline by rotating IPs and reducing block rates when you scrape many tickers across markets.