Scrape Product Reviews from Best Buy with Python (SKU + Ratings + Pagination)
Best Buy product pages are heavy, but the review data is available from a clean, paginated JSON endpoint once you know the SKU.
In this guide you’ll build a scraper that:
- takes a Best Buy product URL
- extracts the SKU (robustly)
- downloads review pages (JSON) with retries/backoff
- normalizes the fields you actually care about
- exports JSONL + CSV

Review crawls fail for boring reasons: timeouts, flaky responses, and throttles. ProxiesAPI fits cleanly into your fetch layer so retries and rotation stay a small change — not a rewrite.
What we’re scraping (data shape)
At a high level, you’ll do two fetches:
- Product page HTML → get the SKU (if your URL doesn’t already include it)
- UGC reviews JSON → paginate reviews for that SKU
The review endpoint is typically shaped like:
https://www.bestbuy.com/ugc/v2/reviews?sku=SKU&page=1&pageSize=20&sort=MOST_HELPFUL
This is not a “secret API” — it’s how Best Buy’s own frontend loads reviews.
Setup
python3 -m venv .venv
source .venv/bin/activate
pip install requests beautifulsoup4 lxml pandas
export PROXIESAPI_KEY="YOUR_KEY" # optional
We’ll use:
- requests for HTTP
- BeautifulSoup (lxml) for reliable HTML parsing (SKU extraction fallback)
- pandas for quick CSV export
Step 1: (Optional) ProxiesAPI fetch wrapper for HTML
If you only scrape a single product URL, you can fetch it directly.
If you scale to many SKUs (or you’re outside the U.S. and hit extra friction), keep your architecture clean:
fetch_html() → parse SKU → fetch reviews JSON → export
import os
import time
import random
import urllib.parse
import requests
# ProxiesAPI auth key, read from the environment; empty string when unset.
PROXIESAPI_KEY = os.environ.get("PROXIESAPI_KEY", "")
# Timeout tuple handed to every session.get() call in this file.
TIMEOUT = (10, 40) # connect, read
# Single requests.Session shared by the HTML and reviews fetchers.
session = requests.Session()
def proxiesapi_url(target_url: str) -> str:
    """Build the ProxiesAPI request URL that wraps *target_url*.

    Both the auth key and the target URL are percent-encoded with no
    characters treated as safe, so the target's own ``?``, ``&`` and ``/``
    end up inside the ``url`` parameter instead of the wrapper's query.

    Raises:
        RuntimeError: if ``PROXIESAPI_KEY`` is empty.
    """
    if not PROXIESAPI_KEY:
        raise RuntimeError("Set PROXIESAPI_KEY in your environment")
    encoded_key = urllib.parse.quote(PROXIESAPI_KEY, safe="")
    encoded_target = urllib.parse.quote(target_url, safe="")
    return f"http://api.proxiesapi.com/?auth_key={encoded_key}&url={encoded_target}"
def fetch_html(url: str, *, use_proxiesapi: bool = True, max_retries: int = 4) -> str:
    """Download *url* and return its HTML, retrying with backoff on failure.

    The request is routed through ProxiesAPI only when ``use_proxiesapi`` is
    true and ``PROXIESAPI_KEY`` is set; otherwise *url* is fetched directly.

    Args:
        url: Page to download.
        use_proxiesapi: Route through ProxiesAPI when a key is configured.
        max_retries: Total number of attempts before giving up.

    Returns:
        The response body as text.

    Raises:
        RuntimeError: if every attempt fails (HTTP error status, network
            error, or a body shorter than 2000 characters). The last
            underlying error is attached as ``__cause__``.
    """
    # Neither the URL nor the headers change between attempts, so build once.
    final_url = proxiesapi_url(url) if (use_proxiesapi and PROXIESAPI_KEY) else url
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/123.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
    }

    last_err = None
    for attempt in range(1, max_retries + 1):
        try:
            r = session.get(final_url, timeout=TIMEOUT, headers=headers)
            r.raise_for_status()
            html = r.text
            # A tiny body is treated as a failed fetch and retried.
            if not html or len(html) < 2000:
                raise RuntimeError(f"Suspiciously small HTML ({len(html)} bytes)")
            return html
        except Exception as e:  # deliberate: any failure here is retryable
            last_err = e
            # Back off only when another attempt follows; sleeping after the
            # final failure would just delay the error.
            if attempt < max_retries:
                time.sleep(min(10, 2 ** (attempt - 1)) + random.random())
    raise RuntimeError(
        f"HTML fetch failed after {max_retries} attempts: {last_err}"
    ) from last_err
Step 2: Extract the SKU robustly
Best Buy product URLs often contain skuId=1234567. If it’s present, use it.
If it isn’t present, fall back to parsing the HTML.
import re
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs
def sku_from_url(url: str) -> str | None:
qs = parse_qs(urlparse(url).query)
if "skuId" in qs and qs["skuId"]:
return qs["skuId"][0]
m = re.search(r"/(\d+)\.p\b", url)
return m.group(1) if m else None
def sku_from_html(html: str) -> str | None:
# Common patterns in embedded JSON/config blobs.
m = re.search(r'"skuId"\s*:\s*(\d+)', html)
if m:
return m.group(1)
soup = BeautifulSoup(html, "lxml")
el = soup.select_one("[data-sku-id]")
if el and el.get("data-sku-id"):
return el["data-sku-id"]
return None
def get_sku(product_url: str, *, use_proxiesapi: bool = True) -> str:
    """Resolve the SKU for a product URL.

    The URL itself is checked first; the page is downloaded and parsed only
    when the URL carries no SKU.

    Raises:
        RuntimeError: if neither the URL nor the page HTML yields a SKU.
    """
    found = sku_from_url(product_url)
    if not found:
        page = fetch_html(product_url, use_proxiesapi=use_proxiesapi)
        found = sku_from_html(page)
        if not found:
            raise RuntimeError("Could not find skuId in URL or HTML")
    return found
Step 3: Fetch reviews JSON with pagination + backoff
Now we’ll pull the reviews page-by-page.
Two rules keep this reliable:
- treat non-200s and empty payloads as retryable
- slow down between pages (Best Buy will throttle if you hammer it)
import json
from typing import Any
def fetch_reviews_page(sku: str, page: int, *, page_size: int = 20, max_retries: int = 4) -> dict[str, Any]:
    """Fetch one page of the reviews JSON for *sku*.

    Args:
        sku: Product SKU to query.
        page: Page number to request.
        page_size: Value sent as ``pageSize``.
        max_retries: Total number of attempts before giving up.

    Returns:
        The decoded JSON payload (always a dict).

    Raises:
        RuntimeError: if every attempt fails (HTTP error status, network
            error, undecodable JSON, or a non-dict payload). The last
            underlying error is attached as ``__cause__``.
    """
    endpoint = "https://www.bestbuy.com/ugc/v2/reviews"
    # Let requests build the query string so every value is URL-encoded
    # instead of being pasted into the URL verbatim.
    params = {
        "sku": sku,
        "page": page,
        "pageSize": page_size,
        "sort": "MOST_HELPFUL",
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; ProxiesAPI-Guides/1.0)",
        "Accept": "application/json,text/plain,*/*",
        "Accept-Language": "en-US,en;q=0.9",
    }

    last_err = None
    for attempt in range(1, max_retries + 1):
        try:
            r = session.get(endpoint, params=params, timeout=TIMEOUT, headers=headers)
            r.raise_for_status()
            data = r.json()
            if not isinstance(data, dict):
                raise RuntimeError("Unexpected JSON type")
            return data
        except Exception as e:  # deliberate: any failure here is retryable
            last_err = e
            # Back off only when another attempt follows.
            if attempt < max_retries:
                time.sleep(min(10, 2 ** (attempt - 1)) + random.random())
    raise RuntimeError(f"Reviews fetch failed (page {page}): {last_err}") from last_err
def iter_reviews(sku: str, *, max_pages: int = 50, page_size: int = 20) -> list[dict]:
    """Collect reviews for *sku* page by page and return them as one list.

    Paging stops at the first page with no reviews, once the payload's
    reported page count is reached, or after ``max_pages`` pages, whichever
    comes first. A randomized pause separates consecutive page requests.
    """
    all_reviews: list[dict] = []
    for page in range(1, max_pages + 1):
        data = fetch_reviews_page(sku, page, page_size=page_size)
        # Typical payloads use keys like reviews / totalPages; keep this defensive.
        reviews = data.get("reviews") or data.get("items") or []
        if not reviews:
            break
        all_reviews.extend(reviews)

        total_pages = data.get("totalPages") or data.get("total_pages")
        if isinstance(total_pages, int) and page >= total_pages:
            break

        # Pause only when another request follows; waiting after the last
        # page would just slow the return.
        if page < max_pages:
            time.sleep(0.6 + random.random())
    return all_reviews
Step 4: Normalize review fields (what you actually export)
The raw JSON can be noisy. Normalize down to something stable:
- rating
- title / body
- author (when present)
- submission date
- helpful votes / verified buyer flags (when present)
from datetime import datetime
def norm_review(r: dict) -> dict:
    """Flatten one raw review dict into the fixed export schema.

    Each output field takes the first candidate key that is present in *r*
    with a value other than ``None`` or ``""``; fields with no usable
    candidate are ``None``. When the author value is a dict, its ``"name"``
    entry is used instead of the dict itself.
    """
    def pick(*keys):
        # First usable value among the candidate keys, else None.
        return next(
            (r[k] for k in keys if k in r and r[k] not in (None, "")),
            None,
        )

    author = pick("reviewer", "author")
    if isinstance(author, dict):
        author = author.get("name")

    normalized = {}
    normalized["review_id"] = pick("id", "reviewId", "submissionId")
    normalized["sku"] = pick("sku", "skuId")
    normalized["rating"] = pick("rating", "ratingValue", "score")
    normalized["title"] = pick("title", "headline")
    normalized["body"] = pick("comment", "text", "reviewText", "body")
    normalized["author"] = author
    normalized["submitted_at"] = pick("submissionTime", "submissionDate", "submittedAt")
    normalized["helpful_votes"] = pick("helpfulVotes", "helpfulCount")
    normalized["verified_buyer"] = pick("verifiedBuyer", "isVerifiedBuyer")
    return normalized
Step 5: Export JSONL + CSV
import pandas as pd
def export(reviews: list[dict], slug: str) -> None:
    """Normalize *reviews* and write them to ``<slug>.jsonl`` and ``<slug>.csv``.

    The JSONL file gets one normalized review per line (UTF-8, non-ASCII
    characters kept as-is); the CSV is written from a DataFrame of the same
    rows without the index column. Each written path is printed together
    with the row count.
    """
    rows = list(map(norm_review, reviews))

    jsonl_path = f"{slug}.jsonl"
    with open(jsonl_path, "w", encoding="utf-8") as out:
        out.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)

    csv_path = f"{slug}.csv"
    pd.DataFrame(rows).to_csv(csv_path, index=False)

    for written in (jsonl_path, csv_path):
        print("wrote", written, len(rows))
Full runnable script
Put it all together: keep the imports and functions from Steps 1–5 in the same file, then add this at the bottom:
# Placeholder: replace with a real Best Buy product URL before running.
PRODUCT_URL = "PASTE_A_BESTBUY_PRODUCT_URL_HERE"
# Resolve the SKU from the URL, falling back to the page HTML if needed.
sku = get_sku(PRODUCT_URL, use_proxiesapi=True)
print("sku:", sku)
# Fetch at most 25 pages, requesting 20 reviews per page.
raw_reviews = iter_reviews(sku, max_pages=25, page_size=20)
print("raw reviews:", len(raw_reviews))
# Writes bestbuy_reviews_<sku>.jsonl and bestbuy_reviews_<sku>.csv.
export(raw_reviews, slug=f"bestbuy_reviews_{sku}")
QA checklist (before you scale)
- SKU extraction works for 2–3 different product URLs
- Page 1 returns non-empty reviews
- Pagination increases total review count
- Exported CSV opens cleanly and has sane columns
- You’re sleeping between pages (avoid throttles)
Where ProxiesAPI fits (no hype)
ProxiesAPI doesn’t magically make every request succeed.
What it does give you is a consistent fetch layer (rotation + retry-friendly behavior) so your parsing logic stays clean while you scale.
If you keep your scraper architecture as fetch → parse → export, swapping in/out ProxiesAPI is a small change.
Review crawls fail for boring reasons: timeouts, flaky responses, and throttles. ProxiesAPI fits cleanly into your fetch layer so retries and rotation stay a small change — not a rewrite.