Scrape Shopee Reviews at Scale: Ratings, Review Text, and Product Metadata
Shopee product pages are JavaScript-heavy, which makes “parse the HTML” scraping fragile.
For review analytics (sentiment, defect clustering, star distribution), you usually want:
- product metadata (name, currency, sold count)
- review text + star ratings
- pagination with a predictable offset
- exports you can feed into notebooks / BI tools
In this guide we’ll build a Shopee review scraper that:
- extracts shopid + itemid from a product URL
- fetches product metadata (JSON endpoint)
- paginates reviews (JSON endpoint)
- exports reviews.jsonl + reviews.csv

Ecommerce endpoints throttle hard and change often. ProxiesAPI gives you a stable fetch layer + retries so review pagination doesn’t collapse halfway through a job.
Important note (what this guide does and doesn’t do)
- This guide shows a technical pattern for fetching Shopee JSON endpoints and exporting data.
- You should only scrape pages/endpoints you’re allowed to access and do so responsibly (rate limits, terms, privacy).
Setup
python3 -m venv .venv
source .venv/bin/activate
pip install requests pandas
ProxiesAPI fetch helper (JSON-friendly)
import os
import time
import random
import urllib.parse
import requests
# ProxiesAPI auth key, read from the environment; empty string when unset
# (proxiesapi_url() refuses to build a URL in that case).
PROXIESAPI_KEY = os.environ.get("PROXIESAPI_KEY", "")
# Timeout handed to every session.get() call in fetch_json(); presumably
# requests' (connect, read) tuple in seconds -- confirm against requests docs.
TIMEOUT = (10, 40)
# One shared Session object, reused by fetch_json() for all requests.
session = requests.Session()
def proxiesapi_url(target_url: str) -> str:
    """Wrap *target_url* in a ProxiesAPI request URL.

    Both the auth key and the target URL are percent-encoded with no
    characters left unescaped (``safe=""``), so each one survives as a
    single query-parameter value.

    Args:
        target_url: The URL ProxiesAPI should fetch on our behalf.

    Returns:
        The ProxiesAPI URL carrying the key and the encoded target.

    Raises:
        RuntimeError: If ``PROXIESAPI_KEY`` is empty.
    """
    if PROXIESAPI_KEY:
        encoded_key = urllib.parse.quote(PROXIESAPI_KEY, safe="")
        encoded_target = urllib.parse.quote(target_url, safe="")
        # NOTE(review): the auth key is sent over plain http here; confirm
        # whether the API also accepts https before changing the scheme.
        return f"http://api.proxiesapi.com/?auth_key={encoded_key}&url={encoded_target}"
    raise RuntimeError("Set PROXIESAPI_KEY in your environment")
def fetch_json(url: str, *, use_proxiesapi: bool = True, max_retries: int = 5) -> dict:
    """GET *url* and return the decoded JSON object, retrying on failure.

    Each attempt sends browser-like headers through the shared ``session``.
    Any failure during an attempt (network error, non-2xx status, invalid
    JSON, or a JSON value that is not an object) triggers a retry after an
    exponential backoff of 1, 2, 4, 8, then at most 12 seconds, plus up to
    one second of random jitter.

    Args:
        url: The target endpoint.
        use_proxiesapi: When true, route the request through ProxiesAPI by
            wrapping *url* with ``proxiesapi_url()``.
        max_retries: Maximum number of attempts.

    Returns:
        The parsed JSON object as a dict.

    Raises:
        RuntimeError: If ``PROXIESAPI_KEY`` is missing (raised immediately,
            not retried), or if every attempt failed; in the latter case
            the last underlying error is chained as ``__cause__``.
    """
    # Build the request URL once, outside the retry loop: a missing API key
    # is a configuration error that no retry can fix, so it must surface
    # right away instead of after several backoff sleeps.
    final_url = proxiesapi_url(url) if use_proxiesapi else url
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/123.0 Safari/537.36"
        ),
        "Accept": "application/json,text/plain,*/*",
        "Accept-Language": "en-US,en;q=0.9",
    }

    last_err = None
    for attempt in range(1, max_retries + 1):
        try:
            r = session.get(final_url, timeout=TIMEOUT, headers=headers)
            r.raise_for_status()
            data = r.json()
            if not isinstance(data, dict):
                raise RuntimeError("Unexpected JSON shape")
            return data
        except Exception as e:
            # Deliberately broad: this is the retry boundary, and every
            # per-attempt failure is treated as potentially transient.
            last_err = e
        # Back off only when another attempt follows; sleeping after the
        # final failure would just delay the error.
        if attempt < max_retries:
            sleep_s = min(12, 2 ** (attempt - 1)) + random.random()
            time.sleep(sleep_s)
    raise RuntimeError(
        f"JSON fetch failed after {max_retries} attempts: {last_err}"
    ) from last_err
Step 1: Get shopid and itemid from a Shopee URL
Most Shopee product URLs include a stable i.<shopid>.<itemid> segment.
Examples you’ll see in the wild:
https://shopee.sg/Product-Name-i.12345678.987654321
https://shopee.ph/...-i.123.456
import re
from urllib.parse import urlparse
def parse_ids(product_url: str) -> tuple[int, int]:
    """Extract ``(shopid, itemid)`` from a Shopee product URL.

    Only the URL path is searched, so query strings and fragments are
    ignored. The ``i.<shopid>.<itemid>`` marker must not be preceded by an
    ASCII letter or digit, and the last such marker in the path wins. This
    keeps product titles that happen to contain text like
    ``Xiaomi.12.5`` from being mistaken for the id segment.

    Args:
        product_url: A product page URL containing ``i.<shopid>.<itemid>``.

    Returns:
        The shop id and item id as integers.

    Raises:
        ValueError: If no id segment is found in the URL path.
    """
    u = urlparse(product_url)
    # The lookbehind rejects an "i" that is merely the tail of a word
    # (e.g. the final letter of "Xiaomi").
    matches = re.findall(r"(?<![A-Za-z0-9])i\.(\d+)\.(\d+)", u.path)
    if not matches:
        raise ValueError("Could not parse shopid/itemid from URL (expected i.<shopid>.<itemid>)")
    # The id segment sits at the end of the slug, so prefer the last match.
    shopid, itemid = matches[-1]
    return int(shopid), int(itemid)
If your URLs don’t include the i.<shopid>.<itemid> segment, a fallback is:
- fetch the product page HTML and look for embedded JSON containing shopid/itemid
But start with the URL parsing approach—it’s fast and stable.
Step 2: Fetch product metadata (JSON)
Shopee commonly exposes a product endpoint like:
.../api/v4/item/get?shopid=...&itemid=...
The base domain depends on the locale (shopee.sg, shopee.ph, shopee.co.id, etc).
from urllib.parse import urlparse
def base_origin(product_url: str) -> str:
    """Return the ``scheme://host`` origin of *product_url*.

    API URLs are built on this origin so that requests go to the same
    locale domain as the product page.

    Args:
        product_url: An absolute product page URL.

    Returns:
        The origin, e.g. ``"https://shopee.sg"``.

    Raises:
        ValueError: If the URL has no scheme or no host. Without this check
            the result would be the useless string ``"://"``, and the
            failure would only show up later as a failed fetch.
    """
    u = urlparse(product_url)
    if not u.scheme or not u.netloc:
        raise ValueError("Product URL must be absolute, e.g. https://shopee.sg/...")
    return f"{u.scheme}://{u.netloc}"
def get_product(product_url: str) -> dict:
    """Fetch the raw product JSON for the item behind *product_url*.

    The endpoint is built on the product URL's own origin and queried
    through ProxiesAPI via ``fetch_json()``.

    Args:
        product_url: A Shopee product page URL.

    Returns:
        The decoded JSON object returned by the item endpoint.
    """
    site = base_origin(product_url)
    shop, item = parse_ids(product_url)
    endpoint = f"{site}/api/v4/item/get?shopid={shop}&itemid={item}"
    return fetch_json(endpoint, use_proxiesapi=True)
Pick only the fields you need for your analysis:
def extract_product_fields(product_json: dict) -> dict:
    """Pick the analysis-relevant fields out of a product JSON payload.

    Missing pieces never raise: a falsy *product_json*, a missing or null
    ``"data"`` object, or a missing or null ``"item_rating"`` all yield
    ``None`` for the affected fields.

    Args:
        product_json: The decoded response of the item endpoint.

    Returns:
        A dict with the keys ``itemid``, ``shopid``, ``name``, ``currency``,
        ``sold``, ``historical_sold``, ``liked_count``, ``rating_star`` and
        ``rating_count_total``.
    """
    # Use `or {}` rather than a .get() default: the default only applies
    # when the key is absent, but a payload can carry an explicit
    # `"data": null`, which would otherwise crash on the next .get().
    item = (product_json or {}).get("data") or {}
    rating = item.get("item_rating") or {}

    # The first element of rating_count is taken as the overall total
    # (presumably followed by per-star counts -- verify against a payload).
    rating_count = rating.get("rating_count")
    has_counts = isinstance(rating_count, list) and rating_count
    rating_count_total = rating_count[0] if has_counts else None

    plain_fields = (
        "itemid",
        "shopid",
        "name",
        "currency",
        "sold",
        "historical_sold",
        "liked_count",
    )
    fields = {key: item.get(key) for key in plain_fields}
    fields["rating_star"] = rating.get("rating_star")
    fields["rating_count_total"] = rating_count_total
    return fields
Step 3: Paginate reviews safely
Shopee review endpoints vary by locale and version. A common pattern is:
.../api/v2/item/get_ratings?shopid=...&itemid=...&limit=20&offset=0
We’ll:
- request limit=20 (you can try 50 if your locale supports it)
- increment offset until we stop getting new reviews
- dedupe by cmtid when present
def build_ratings_url(origin: str, shopid: int, itemid: int, limit: int, offset: int) -> str:
    """Build the ratings endpoint URL for one page of reviews.

    Args:
        origin: The ``scheme://host`` prefix to build the URL on.
        shopid: Shop id of the product.
        itemid: Item id of the product.
        limit: Page size to request.
        offset: Zero-based index of the first review on the page.

    Returns:
        The full ``get_ratings`` URL, with the fixed ``filter``, ``flag``
        and ``type`` parameters followed by paging and product ids.
    """
    endpoint = f"{origin}/api/v2/item/get_ratings"
    paging = f"?filter=0&flag=1&type=0&limit={limit}&offset={offset}"
    product = f"&shopid={shopid}&itemid={itemid}"
    return endpoint + paging + product
def crawl_reviews(product_url: str, *, limit: int = 20, max_pages: int = 200) -> list[dict]:
    """Page through a product's reviews and return them as flat dicts.

    Pages are requested at offsets ``0, limit, 2 * limit, ...`` until one
    of the following happens:

    * a page comes back with no ratings (including a null ``"data"``),
    * a page contributes no review that was not already collected, or
    * ``max_pages`` pages have been fetched.

    Reviews carrying a truthy ``cmtid`` are de-duplicated on it; reviews
    without one are always kept. A randomised pause of 0.8-1.4 seconds is
    taken between consecutive page requests.

    Args:
        product_url: A Shopee product page URL.
        limit: Page size requested from the endpoint.
        max_pages: Upper bound on the number of pages fetched.

    Returns:
        One dict per review with the keys ``shopid``, ``itemid``, ``cmtid``,
        ``userid``, ``author_username``, ``rating_star``, ``comment``,
        ``tags``, ``mtime``, ``like_count`` and ``images``.
    """
    origin = base_origin(product_url)
    shopid, itemid = parse_ids(product_url)

    reviews: list[dict] = []
    seen: set[int] = set()

    for page in range(max_pages):
        url = build_ratings_url(origin, shopid, itemid, limit, page * limit)
        data = fetch_json(url, use_proxiesapi=True)

        # `or {}` also covers an explicit `"data": null`, which a .get()
        # default would let through as None and crash on the next lookup.
        payload = (data or {}).get("data") or {}
        items = payload.get("ratings") or []
        if not items:
            break

        added = 0
        for r in items:
            cmtid = r.get("cmtid")
            if cmtid:
                if cmtid in seen:
                    continue
                seen.add(cmtid)
            reviews.append(
                {
                    "shopid": shopid,
                    "itemid": itemid,
                    "cmtid": cmtid,
                    "userid": r.get("userid"),
                    "author_username": r.get("author_username"),
                    "rating_star": r.get("rating_star"),
                    "comment": r.get("comment"),
                    "tags": r.get("tags"),
                    "mtime": r.get("mtime"),
                    "like_count": r.get("like_count"),
                    "images": r.get("images"),
                }
            )
            added += 1

        # A page made up entirely of already-seen reviews means the
        # endpoint is repeating itself; further requests would only burn
        # quota up to max_pages without adding data.
        if not added:
            break

        # Pause only when another request will actually follow.
        if page + 1 < max_pages:
            time.sleep(0.8 + random.random() * 0.6)

    return reviews
Step 4: Export JSONL + CSV
import json
import pandas as pd
def export_reviews(reviews: list[dict], base: str = "shopee_reviews") -> tuple[str, str]:
    """Write *reviews* to ``<base>.jsonl`` and ``<base>.csv``.

    The JSONL file holds one JSON object per line with non-ASCII text kept
    as-is; the CSV is produced from a pandas DataFrame without the index
    column. Both files are UTF-8 encoded and overwrite existing files.

    Args:
        reviews: The review dicts to export.
        base: Output path without extension.

    Returns:
        ``(jsonl_path, csv_path)``.
    """
    jsonl_path = f"{base}.jsonl"
    csv_path = f"{base}.csv"

    with open(jsonl_path, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(review, ensure_ascii=False) + "\n" for review in reviews
        )

    pd.DataFrame(reviews).to_csv(csv_path, index=False, encoding="utf-8")
    return jsonl_path, csv_path
Full runnable example
def _main() -> None:
    """Scrape one product: print its name, crawl its reviews, export them.

    The product URL is read from the ``SHOPEE_PRODUCT_URL`` environment
    variable.

    Raises:
        RuntimeError: If ``SHOPEE_PRODUCT_URL`` is unset or empty.
    """
    target = os.environ.get("SHOPEE_PRODUCT_URL", "")
    if not target:
        raise RuntimeError("Set SHOPEE_PRODUCT_URL to a product page URL")

    product = extract_product_fields(get_product(target))
    print("product:", product.get("name"))

    collected = crawl_reviews(target, limit=20, max_pages=60)
    print("reviews:", len(collected))

    written = export_reviews(collected, base="shopee_reviews")
    print("wrote:", *written)


if __name__ == "__main__":
    _main()
Common failure modes (and fixes)
- Empty ratings array early: the endpoint params changed for that locale. Open DevTools on a product page, find the get_ratings request, and mirror its query params.
- 429s: reduce concurrency, add jitter, and rely on retry/backoff.
- Locale differences: always build API URLs off the product URL’s origin (shopee.sg vs shopee.ph).
QA checklist
- parse_ids() returns two integers for your product URL
- get_product() returns JSON with a populated data field
- crawl_reviews() returns review text + star ratings (spot-check 5)
- CSV opens cleanly and keeps Unicode review text intact
Ecommerce endpoints throttle hard and change often. ProxiesAPI gives you a stable fetch layer + retries so review pagination doesn’t collapse halfway through a job.