Scrape Google Maps Business Data with Python (Name, Rating, Address, Website)
Google Maps is the internet’s default “local database”. If you can turn it into a structured dataset, you can build:
- local lead lists (with consent/compliance)
- competitive analysis by category + city
- store locators
- directory enrichment
This guide shows an honest approach to extracting a few high-value fields:
- business name
- rating + review count
- address
- phone (when available)
- website (when available)
A note on “scraping Google Maps”
Google Maps uses heavy client-side rendering and aggressive anti-bot defenses. There are two practical paths:
- Use official APIs (Google Places API) — best reliability, costs money.
- Scrape — feasible for small, careful workloads, but you must expect:
  - HTML/JS changes
  - intermittent blocks
  - higher engineering cost
This tutorial focuses on place pages (a single business) because it’s the smallest unit you can reliably parse.

Google Maps is one of the most defended targets on the web. If you’re collecting many places, ProxiesAPI can help keep your fetch layer consistent with rotation and cleaner request hygiene—while you keep rates low and stay compliant.
What we’re scraping (two-step workflow)
Instead of trying to parse an infinite-scrolling search UI, we’ll do:
- collect Google Maps place URLs (manually to start, or via a small search harvest)
- fetch each place URL and extract fields from the page markup
A place URL looks like:
https://www.google.com/maps/place/...
In the page source you’ll often find structured data (including JSON-LD) that contains many fields we care about.
Disclaimer
- Always respect the site’s terms and applicable laws.
- Keep request rates low.
- Don’t bypass access controls.
Setup
python -m venv .venv
source .venv/bin/activate
pip install requests beautifulsoup4 lxml tenacity python-dotenv
Create a .env:
PROXIESAPI_PROXY_URL="http://user:pass@gw.proxiesapi.com:10000"
Step 1: Build a fetch layer that won’t melt down
This pattern is the same as in most ProxiesAPI Guides posts:
- timeouts
- retries with jitter
- optional proxy routing
import os
import random
import time
from dataclasses import dataclass
from typing import Optional

import requests
from dotenv import load_dotenv
from tenacity import retry, stop_after_attempt, wait_exponential_jitter, retry_if_exception_type

load_dotenv()

PROXIESAPI_PROXY_URL = os.getenv("PROXIESAPI_PROXY_URL")
TIMEOUT = (10, 30)  # (connect, read) seconds

session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (compatible; ProxiesAPI-Guides/1.0; +https://proxiesapi.com)",
    "Accept-Language": "en-US,en;q=0.9",
})

@dataclass
class FetchConfig:
    proxiesapi_proxy_url: Optional[str] = None
    min_delay_s: float = 1.0
    max_delay_s: float = 2.5

class FetchError(RuntimeError):
    pass

def _proxies(cfg: FetchConfig):
    if not cfg.proxiesapi_proxy_url:
        return None
    return {"http": cfg.proxiesapi_proxy_url, "https": cfg.proxiesapi_proxy_url}

@retry(
    reraise=True,
    stop=stop_after_attempt(4),
    wait=wait_exponential_jitter(initial=2, max=20),
    retry=retry_if_exception_type((requests.RequestException, FetchError)),
)
def fetch_html(url: str, cfg: FetchConfig) -> str:
    # Jittered politeness delay before every request.
    time.sleep(random.uniform(cfg.min_delay_s, cfg.max_delay_s))
    r = session.get(url, timeout=TIMEOUT, proxies=_proxies(cfg))
    if r.status_code in (429, 503):
        raise FetchError(f"temporary block {r.status_code}")
    if r.status_code >= 500:
        raise FetchError(f"server error {r.status_code}")
    r.raise_for_status()
    return r.text
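Before wiring up any parsing, a quick smoke test helps confirm the fetch layer behaves. The place URL below is a placeholder; an unusually short response usually means you received a consent or challenge page rather than the real place page:

cfg = FetchConfig(proxiesapi_proxy_url=PROXIESAPI_PROXY_URL)
html = fetch_html("https://www.google.com/maps/place/...", cfg)  # swap in a real place URL
print(f"fetched {len(html)} characters")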
Step 2: Extract business fields from JSON-LD
Many place pages include JSON-LD (structured data) with a type like LocalBusiness.
We’ll parse <script type="application/ld+json"> blocks and extract the first one that looks like a business.
import json
import re

from bs4 import BeautifulSoup

def _looks_like_business(obj: dict) -> bool:
    t = obj.get("@type")
    if isinstance(t, list):
        t = " ".join([str(x) for x in t])
    return bool(t) and any(k in str(t).lower() for k in ["localbusiness", "restaurant", "store", "organization"])

def parse_jsonld_business(html: str) -> dict:
    soup = BeautifulSoup(html, "lxml")
    for script in soup.select('script[type="application/ld+json"]'):
        raw = script.get_text(" ", strip=True)
        if not raw:
            continue
        # Some pages embed multiple JSON objects/arrays.
        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            continue
        candidates = data if isinstance(data, list) else [data]
        for obj in candidates:
            if not isinstance(obj, dict):
                continue
            if not _looks_like_business(obj):
                continue
            addr = obj.get("address") or {}
            if isinstance(addr, dict):
                address = ", ".join([x for x in [
                    addr.get("streetAddress"),
                    addr.get("addressLocality"),
                    addr.get("addressRegion"),
                    addr.get("postalCode"),
                    addr.get("addressCountry"),
                ] if x])
            else:
                address = None
            agg = obj.get("aggregateRating") or {}
            rating = agg.get("ratingValue") if isinstance(agg, dict) else None
            review_count = agg.get("reviewCount") if isinstance(agg, dict) else None
            return {
                "name": obj.get("name"),
                "rating": rating,
                "review_count": review_count,
                "address": address,
                "telephone": obj.get("telephone"),
                "website": obj.get("url"),
            }
    return {}
Why JSON-LD?
Parsing visible HTML can be brittle when class names are obfuscated. JSON-LD is often more stable (when present).
But it’s not guaranteed on every place page—so we’ll add a fallback.
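If you want to see the parser work without touching Google at all, you can feed it a synthetic snippet shaped like schema.org LocalBusiness markup (all values below are made up):

sample_html = """
<script type="application/ld+json">
{"@type": "Restaurant", "name": "Cafe Example",
 "aggregateRating": {"ratingValue": "4.5", "reviewCount": "120"},
 "address": {"streetAddress": "1 Main St", "addressLocality": "Springfield", "addressRegion": "IL"},
 "telephone": "+1 555-0100", "url": "https://cafe-example.test"}
</script>
"""
print(parse_jsonld_business(sample_html))
# Expected shape: {'name': 'Cafe Example', 'rating': '4.5', 'review_count': '120', ...}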
Step 3 (Fallback): Extract address/website from page text
Sometimes JSON-LD is missing fields or blocked. A lightweight fallback is to:
- search for "Address" label patterns
- look for outbound http(s) links
This is intentionally “best-effort”, not perfect.
def fallback_extract(html: str) -> dict:
    soup = BeautifulSoup(html, "lxml")
    text = soup.get_text("\n", strip=True)

    # website: first external link that is not google.
    website = None
    for a in soup.select("a[href]"):
        href = a.get("href")
        if not href:
            continue
        if href.startswith("http") and "google.com" not in href:
            website = href
            break

    address = None
    m = re.search(r"Address\s*(.+)", text)
    if m:
        address = m.group(1)[:200]

    return {"website": website, "address": address}
Step 4: Put it together (URL list → CSV)
Create places.txt:
https://www.google.com/maps/place/...
https://www.google.com/maps/place/...
Then run:
import csv
from pathlib import Path

def scrape_place(url: str, cfg: FetchConfig) -> dict:
    html = fetch_html(url, cfg)
    data = parse_jsonld_business(html)
    if not data:
        data = {}
    # Fill missing fields with fallback
    fb = fallback_extract(html)
    for k, v in fb.items():
        if not data.get(k) and v:
            data[k] = v
    data["url"] = url
    return data

if __name__ == "__main__":
    cfg = FetchConfig(proxiesapi_proxy_url=PROXIESAPI_PROXY_URL)
    urls = [
        u.strip()
        for u in Path("places.txt").read_text(encoding="utf-8").splitlines()
        if u.strip() and not u.strip().startswith("#")
    ]

    Path("out").mkdir(exist_ok=True)
    out_path = Path("out/google_maps_places.csv")

    with out_path.open("w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=[
            "name", "rating", "review_count", "address", "telephone", "website", "url"
        ])
        w.writeheader()
        for i, url in enumerate(urls, start=1):
            try:
                row = scrape_place(url, cfg)
                w.writerow(row)
                print(f"[{i}/{len(urls)}] ok: {row.get('name')}")
            except Exception as e:
                print(f"[{i}/{len(urls)}] FAILED: {url} ({e})")

    print("wrote", out_path)
Common failure modes (and what to do)
1) You get blocked (429/503) quickly
- Slow down (min_delay_s / max_delay_s); see the example below
- Reduce concurrency to 1
- Use a proxy gateway (ProxiesAPI) so you’re not hammering from one IP
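For example, a slower FetchConfig (values are illustrative, not tuned) is usually the first lever to pull:

cfg = FetchConfig(
    proxiesapi_proxy_url=PROXIESAPI_PROXY_URL,
    min_delay_s=4.0,  # noticeably slower than the defaults
    max_delay_s=9.0,
)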
2) Returned HTML doesn’t contain useful data
Sometimes you’ll receive a consent page or a bot challenge.
- log and save the HTML to disk when parsing fails
- detect challenge pages (look for “sorry” / “unusual traffic”); a small detector is sketched below
- switch to the official API for reliability
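A minimal sketch of both ideas, assuming heuristic marker strings that you should tune against the pages you actually receive:

import re
from pathlib import Path

def looks_like_challenge(html: str) -> bool:
    # Heuristic only: common strings on Google block/consent pages.
    lowered = html.lower()
    return any(m in lowered for m in ["unusual traffic", "/sorry/", "consent.google.com"])

def save_debug_html(url: str, html: str) -> None:
    # Keep a copy of pages that failed to parse so you can inspect them later.
    debug_dir = Path("out/debug")
    debug_dir.mkdir(parents=True, exist_ok=True)
    fname = re.sub(r"[^a-z0-9]+", "-", url.lower()).strip("-")[:80] + ".html"
    (debug_dir / fname).write_text(html, encoding="utf-8")

Call looks_like_challenge(html) right after fetch_html and skip (or save) the page instead of trying to parse it.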
3) Infinite-scroll search results
Don’t start there.
Start with a small list of known place URLs and build your pipeline end-to-end. Then, if you need automated discovery, consider:
- official Places API (a discovery sketch follows this list)
- curated directories (Yelp, TripAdvisor) as discovery sources
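If you go the API route for discovery, a sketch using the legacy Places Text Search endpoint might look like the following. PLACES_API_KEY is an assumed environment variable, and newer versions of the Places API exist, so check Google’s current documentation:

import os
import requests

def discover_places(query: str) -> list:
    # Legacy Places API "Text Search"; returns basic info you can use as seed URLs/IDs.
    resp = requests.get(
        "https://maps.googleapis.com/maps/api/place/textsearch/json",
        params={"query": query, "key": os.environ["PLACES_API_KEY"]},
        timeout=30,
    )
    resp.raise_for_status()
    return [
        {"name": r.get("name"), "address": r.get("formatted_address"), "place_id": r.get("place_id")}
        for r in resp.json().get("results", [])
    ]

# discover_places("coffee shops in Austin, TX")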
Where ProxiesAPI fits (honestly)
Google Maps is not “a simple scrape target”.
ProxiesAPI won’t magically make it easy—but it can help with the boring part:
- stabilizing your fetch layer with proxy routing
- reducing concentration of traffic on one IP
- making retries less painful
If you need high reliability at scale, the official Places API is usually the cheapest option after you count engineering time.
Next upgrades
- store results in SQLite/Postgres (a minimal sqlite3 sketch follows this list)
- add a “challenge detector” to auto-skip blocked pages
- enrich with geocodes / categories (prefer API for correctness)
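As a starting point for the SQLite upgrade, here is a minimal sketch; the table name and schema are illustrative, and rows are the dicts produced by scrape_place:

import sqlite3
from pathlib import Path

FIELDS = ["url", "name", "rating", "review_count", "address", "telephone", "website"]

def save_rows(rows, db_path="out/places.db"):
    Path("out").mkdir(exist_ok=True)
    con = sqlite3.connect(db_path)
    con.execute(
        "CREATE TABLE IF NOT EXISTS places ("
        "url TEXT PRIMARY KEY, name TEXT, rating TEXT, review_count TEXT, "
        "address TEXT, telephone TEXT, website TEXT)"
    )
    # Normalize dicts so every expected column is present, even if a field was missing.
    con.executemany(
        "INSERT OR REPLACE INTO places VALUES (?, ?, ?, ?, ?, ?, ?)",
        [tuple(r.get(k) for k in FIELDS) for r in rows],
    )
    con.commit()
    con.close()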
Once you’ve chosen a target niche and city, draft a small places.txt starter list and a simple schema for your lead database before you scale anything up.