How to Scrape Google Search Results with Python (Without Getting Blocked)
Scraping Google Search results (SERPs) is one of the most requested scraping tasks — and one of the easiest to get wrong.
In this guide you’ll learn a practical, defensive approach to the keyword “how to scrape google search results with python”:
- how SERP HTML is structured (and why it shifts)
- how consent/interstitials show up
- how to parse organic results without relying on one brittle selector
- how to reduce blocks (backoff, pacing, caching, and proxies)
- how to export results to CSV/JSON
Note: Always review Google’s terms for your use case. If you need stable, high-volume SERP data, a dedicated SERP provider may be a better fit than DIY scraping.
Google SERPs change often and block aggressively. ProxiesAPI gives you proxy rotation and a more resilient request layer so your SERP workflow can be tested and operated with fewer sudden failures.
Why Google SERPs are hard
Google’s SERPs:
- vary by country, language, device, and logged-in state
- include many modules (ads, videos, “People also ask”, local pack)
- can show consent screens depending on region
- use anti-bot systems that trigger “unusual traffic” / CAPTCHA
So the only reliable approach is:
- fetch with retries and detection
- parse multiple candidate patterns
- validate outputs
- keep volume low and predictable
Setup
python -m venv .venv
source .venv/bin/activate
pip install requests beautifulsoup4 lxml
Step 1: A fetch() that detects blocks (and uses ProxiesAPI)
import os
import time
import random
from urllib.parse import urlencode
import requests
# (connect, read) timeouts in seconds, passed straight to requests.get().
TIMEOUT = (10, 30)
# Maximum fetch attempts (with exponential backoff) before giving up.
MAX_RETRIES = 5
# One shared Session so TCP connections are reused across requests.
session = requests.Session()
# Browser-like headers so requests don't stand out as obvious automation.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}
def build_proxiesapi_url(target_url: str) -> str:
    """Wrap ``target_url`` in a ProxiesAPI request URL.

    Reads PROXIESAPI_KEY (required) and PROXIESAPI_URL (optional override
    of the API endpoint) from the environment.

    Raises:
        RuntimeError: if PROXIESAPI_KEY is unset or empty.
    """
    key = os.environ.get("PROXIESAPI_KEY")
    if not key:
        raise RuntimeError("Missing PROXIESAPI_KEY env var")
    endpoint = os.environ.get("PROXIESAPI_URL", "https://api.proxiesapi.com")
    query = urlencode({"api_key": key, "url": target_url})
    return "{}/?{}".format(endpoint, query)
def looks_blocked(html: str) -> bool:
    """Heuristically detect a Google block/CAPTCHA interstitial page.

    Matches phrases specific to Google's "unusual traffic" and /sorry/
    interstitials. Deliberately avoids the bare word "sorry": it appears in
    legitimate titles and snippets and caused false "blocked" detections
    (which made fetch() retry and discard perfectly good pages).
    """
    t = (html or "").lower()
    markers = [
        "our systems have detected unusual traffic",
        "to continue, please verify",
        "/sorry/index",       # redirect target of Google's block page
        "google.com/sorry",   # interstitial URL embedded in the page
        "captcha",
    ]
    return any(m in t for m in markers)
def fetch(url: str, *, use_proxiesapi: bool = True) -> str:
    """Fetch ``url`` (optionally via ProxiesAPI) with retries and block detection.

    Retries up to MAX_RETRIES times with capped exponential backoff plus
    jitter on transient statuses (429/503), network errors, and detected
    interstitial pages. Re-raises the last error when retries are exhausted.
    """
    # Build the final URL once, outside the retry loop: a missing API key is
    # a configuration error and should fail immediately instead of being
    # retried MAX_RETRIES times. The URL itself is loop-invariant anyway.
    final = build_proxiesapi_url(url) if use_proxiesapi else url
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = session.get(final, headers=DEFAULT_HEADERS, timeout=TIMEOUT)
            # Rate-limit/overload statuses are retryable, not fatal.
            if r.status_code in (429, 503):
                raise RuntimeError(f"transient status {r.status_code}")
            r.raise_for_status()
            html = r.text or ""
            if looks_blocked(html):
                raise RuntimeError("blocked/interstitial detected")
            return html
        except Exception as e:
            if attempt == MAX_RETRIES:
                raise
            # Exponential backoff capped at 30s; jitter avoids lock-step retries.
            sleep_s = min(30, 2 ** (attempt - 1)) + random.uniform(0, 0.7)
            print(f"fetch failed ({attempt}/{MAX_RETRIES}): {e} — sleeping {sleep_s:.1f}s")
            time.sleep(sleep_s)
    raise RuntimeError("unreachable")  # every path above returns or raises
Step 2: Build a search URL the right way
The simplest approach is a plain query parameter:
https://www.google.com/search?q=YOUR+QUERY&num=10&hl=en&gl=us&pws=0
Parameters:
- num=10 — results per page (Google may cap this)
- hl=en — UI language
- gl=us — geolocation
- pws=0 — disable personalized search signals (best-effort)
from urllib.parse import quote_plus
def google_search_url(q: str, *, hl: str = "en", gl: str = "us", num: int = 10, start: int = 0) -> str:
    """Build a stable, depersonalized google.com/search URL.

    ``start`` is the result offset used for pagination (0, 10, 20, ...);
    ``pws=0`` asks Google to skip personalized-search signals (best-effort).
    """
    query = urlencode({"q": q, "num": num, "hl": hl, "gl": gl, "pws": 0, "start": start})
    return f"https://www.google.com/search?{query}"
# Example run: fetch one SERP page and confirm we got substantial HTML back.
url = google_search_url("site:github.com proxies")
print(url)
html = fetch(url)
print(len(html))
Step 3: Parse organic results (defensive)
SERPs contain many modules. We want organic results.
There is no single forever selector, but a common pattern is:
- container blocks live under #search
- organic results usually contain an <a> with an href that looks like a real external URL
We’ll implement a parser that:
- searches within #search first
- extracts candidate result links
- filters out internal Google links
- tries to capture title + snippet when available
import re
from bs4 import BeautifulSoup
from urllib.parse import urlparse
def is_google_internal(href: str) -> bool:
    """Return True for links that stay inside Google's own properties.

    Empty hrefs and relative paths (Google nav links) count as internal.
    Host matching requires the exact domain or a true subdomain, so that
    look-alike hosts such as "notgoogle.com" are NOT misclassified — the
    previous endswith("google.com") check matched those too and silently
    dropped legitimate organic results.
    """
    if not href:
        return True
    if href.startswith("/"):
        return True
    host = urlparse(href).netloc.lower()
    for domain in ("google.com", "googleusercontent.com"):
        if host == domain or host.endswith("." + domain):
            return True
    return False
def parse_serp(html: str) -> list[dict]:
    """Extract organic-looking results from SERP HTML.

    Scans <div> blocks under #search (falling back to the whole document),
    keeps the first external link per block, and returns dicts with
    "title", "url", and best-effort "snippet" keys, deduplicated by URL.
    """
    soup = BeautifulSoup(html, "lxml")
    root = soup.select_one("div#search") or soup

    out: list[dict] = []
    seen_urls: set[str] = set()

    for candidate in root.select("div"):
        anchor = candidate.select_one("a[href]")
        if not anchor:
            continue
        link = anchor.get("href")
        if not link or is_google_internal(link):
            continue

        # Title: prefer an <h3>; an absent or empty heading falls back to
        # the anchor's own text.
        heading = candidate.select_one("h3")
        text = (heading or anchor).get_text(" ", strip=True)
        if not text:
            continue
        title = text.strip()

        # Snippet: best-effort lookup in known container classes.
        snippet_el = candidate.select_one("div.VwiC3b") or candidate.select_one("span.aCOpRe")
        snippet = snippet_el.get_text(" ", strip=True) if snippet_el else None

        if link in seen_urls:
            continue
        seen_urls.add(link)
        out.append({"title": title, "url": link, "snippet": snippet})

    return out
# Parse the fetched page and eyeball the first few organic candidates.
items = parse_serp(html)
print("organic candidates:", len(items))
print(items[:3])
Important: validate output
Always sanity-check:
- Do you see real external URLs?
- Are titles plausible?
- Are you accidentally capturing nav links?
If not, open the HTML in a browser and adjust your selectors.
Step 4: Pagination (start=10,20,...)
Google uses start= offsets.
def crawl_serp(q: str, pages: int = 3) -> list[dict]:
    """Crawl up to ``pages`` SERP pages for ``q``, deduplicating by URL.

    Fetches each page via fetch(), parses it with parse_serp(), and keeps
    only results whose URL has not been seen yet. Pauses between pages.
    """
    collected: list[dict] = []
    seen_urls: set[str] = set()
    for page_idx in range(pages):
        page_html = fetch(google_search_url(q, start=page_idx * 10))
        batch = parse_serp(page_html)

        fresh = 0
        for item in batch:
            link = item["url"]
            if link in seen_urls:
                continue
            seen_urls.add(link)
            collected.append(item)
            fresh += 1

        print(f"page {page_idx+1}: batch={len(batch)} new={fresh} total={len(collected)}")
        # Gentle pacing between pages (2.0–2.8s) to keep volume predictable.
        time.sleep(2.0 + random.uniform(0, 0.8))
    return collected
rows = crawl_serp("proxies api", pages=2)
Step 5: Export to CSV/JSON
import csv
import json
def write_csv(path: str, rows: list[dict]) -> None:
    """Write ``rows`` to ``path`` as UTF-8 CSV with a title/url/snippet header."""
    columns = ["title", "url", "snippet"]
    with open(path, "w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=columns)
        writer.writeheader()
        writer.writerows({col: row.get(col) for col in columns} for row in rows)
def write_json(path: str, rows: list[dict]) -> None:
    """Write ``rows`` to ``path`` as pretty-printed, non-ASCII-escaped JSON."""
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(json.dumps(rows, ensure_ascii=False, indent=2))
# Export the crawled rows in both formats.
write_csv("google_serp.csv", rows)
write_json("google_serp.json", rows)
print("wrote google_serp.csv + google_serp.json")
How to reduce blocks (what actually works)
1. Lower your volume — don't scrape at high frequency; cache results.
2. Back off on trouble — 429/503 responses and interstitial detection should slow you down, not speed you up.
3. Rotate IPs — a single IP doing repeated SERP fetches gets flagged quickly.
4. Keep requests consistent — stable headers and stable parameters (hl, gl, pws).
5. Don't parse every module — start with organic results; add "People also ask" or the local pack only if needed.
Where ProxiesAPI fits (honestly)
SERP scraping is exactly where a proxy layer helps:
- IP rotation reduces repeated requests from one origin
- it’s easier to run from a server without burning your own IP reputation
But proxies are not a magic wand. If you crawl too fast, you’ll still get blocked.
QA checklist
- fetch() fails fast on config errors and retries with backoff
- you detect common interstitial text
- parse_serp() returns mostly external links
- pagination increases unique results
- exports are clean
Next upgrades
- store results in SQLite and refresh only when stale
- parse “People also ask” questions (carefully)
- add per-keyword scheduling + quotas
Google SERPs change often and block aggressively. ProxiesAPI gives you proxy rotation and a more resilient request layer so your SERP workflow can be tested and operated with fewer sudden failures.