How to Scrape Walmart Grocery Prices with Python (Search + Product Pages)
Walmart grocery data is useful for:
- building a price tracker (your pantry basket, local store comparisons)
- competitive price monitoring
- demand research (availability + substitutions)
In this guide you’ll build a Python scraper that can:
- Search for grocery items on Walmart
- Extract product cards (name, link, current price when shown)
- Visit each product page and extract:
- title
- current price
- unit/size (when available)
- availability / delivery badges (best-effort)
- Export the result to JSON
We’ll also include a robust fetch layer with retries and optional ProxiesAPI configuration.

Walmart pages are high-traffic and change often. ProxiesAPI helps you reduce blocks and keep retries + rotation consistent when you scale from a few SKUs to thousands.
Important reality check: Walmart is JS-heavy
Many Walmart pages are rendered by JavaScript and/or backed by JSON endpoints.
So you have two approaches:
- Approach A (HTML scraping): Works when the HTML includes the data or when meta tags/JSON-LD are present.
- Approach B (JSON endpoints): Often more stable for prices/variants.
In this tutorial, we’ll implement both:
- Try to parse JSON-LD (the <script type="application/ld+json"> block).
- Fall back to HTML selectors.
If you find HTML is too thin (no product cards), the JSON-endpoint approach is usually the next step.
Setup
python -m venv .venv
source .venv/bin/activate
pip install requests beautifulsoup4 lxml tenacity
Step 1: Fetch layer (timeouts + retries + ProxiesAPI)
import os
import random
import requests
from tenacity import retry, stop_after_attempt, wait_exponential_jitter
# (connect timeout, read timeout) in seconds, passed to every request
TIMEOUT = (10, 40)
# Small pool of desktop-browser User-Agent strings; one is chosen at random
# per request to vary the traffic signature slightly.
UA_POOL = [
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
]
# Shared session so TCP connections (and any cookies) are reused across requests.
session = requests.Session()
def build_proxies():
    """Return a requests-style proxies mapping built from the environment.

    Reads PROXIESAPI_PROXY_URL; when it is unset or empty, returns None so
    requests connects directly (no proxy).
    """
    endpoint = os.getenv("PROXIESAPI_PROXY_URL")
    return {"http": endpoint, "https": endpoint} if endpoint else None
@retry(stop=stop_after_attempt(6), wait=wait_exponential_jitter(initial=1, max=20))
def get(url: str) -> str:
    """Fetch *url* and return the response body as text.

    Any raised error — including the RuntimeError used to flag soft blocks
    (403/429/503) and HTTPError from raise_for_status() — triggers a
    tenacity retry with exponential backoff + jitter, up to 6 attempts.
    """
    request_headers = {
        "User-Agent": random.choice(UA_POOL),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
    }
    response = session.get(
        url,
        headers=request_headers,
        timeout=TIMEOUT,
        proxies=build_proxies(),
        allow_redirects=True,
    )
    # Treat common "soft block" statuses as retryable failures rather than
    # returning a block page as if it were real content.
    if response.status_code in (403, 429, 503):
        raise RuntimeError(f"Blocked/rate limited: {response.status_code}")
    response.raise_for_status()
    return response.text
Step 2: Search Walmart for grocery items
Walmart search URLs typically look like:
https://www.walmart.com/search?q=milk
We’ll fetch the HTML and extract product links.
import re
from urllib.parse import quote, urljoin

from bs4 import BeautifulSoup
BASE = "https://www.walmart.com"


def clean(x: str) -> str:
    """Collapse whitespace runs to single spaces and strip the ends.

    Tolerates None/empty input (returns "").
    """
    return re.sub(r"\s+", " ", (x or "").strip())


def abs_url(href: str) -> str:
    """Resolve *href* against BASE into an absolute URL.

    urljoin correctly handles absolute URLs, root-relative paths, and
    scheme-relative links ("//host/..."); the previous startswith("http")
    check mangled the scheme-relative case into BASE + "//host/...".
    """
    return urljoin(BASE, href)


def build_search_url(query: str) -> str:
    """Return the Walmart search URL for *query* (URL-encoded)."""
    return f"{BASE}/search?q={quote(query)}"
def parse_search_results(html: str, limit: int = 10) -> list[dict]:
    """Extract up to *limit* unique product cards from a search results page.

    Returns a list of {"name": ..., "url": ...} dicts. Walmart markup
    changes often; the stable signal is that product links contain "/ip/".

    De-duplication happens while collecting, so *limit* counts unique
    products. (The old code applied the limit first and de-duplicated
    after, so pages with several anchors per card returned fewer than
    *limit* results.)
    """
    soup = BeautifulSoup(html, "lxml")
    seen: dict[str, dict] = {}
    for a in soup.select("a[href*='/ip/']"):
        href = a.get("href")
        if not href:
            continue
        # Drop the query string so variants of the same link collapse together.
        url = abs_url(href.split("?")[0])
        name = clean(a.get_text(" ", strip=True))
        # Skip icon/empty anchors that carry no useful title.
        if not name or len(name) < 3:
            continue
        if url not in seen:
            seen[url] = {"name": name, "url": url}
            if len(seen) >= limit:
                break
    return list(seen.values())
if __name__ == "__main__":
    # Demo: search for "oats" and print a small sample of the parsed cards.
    page = get(build_search_url("oats"))
    found = parse_search_results(page, limit=10)
    print("results:", len(found))
    print(found[:2])
If you get too few results, it’s likely because the page is rendering product data via JS. In that case you should:
- use the product JSON endpoints (next section)
- or switch to a headless browser (Playwright)
Step 3: Extract price/size from a product page
Walmart product pages often embed structured data in JSON-LD, which is much easier than scraping brittle HTML.
We’ll implement:
- Try JSON-LD first → parse offers.price, name, etc.
- Fall back to HTML selectors (best-effort)
import json
from bs4 import BeautifulSoup
def parse_jsonld_product(soup: BeautifulSoup) -> dict | None:
    """Return the first JSON-LD Product/ProductGroup object on the page.

    Walks every <script type="application/ld+json"> tag and tolerates the
    shapes commonly seen in the wild:
    - malformed JSON (skipped),
    - a top-level list instead of a single object,
    - objects wrapped in an "@graph" array,
    - list-valued "@type" (e.g. ["Product", "Thing"]).

    Returns None when no product object is found.
    """
    for tag in soup.select('script[type="application/ld+json"]'):
        # .string is None when the script tag has multiple child nodes;
        # get_text() still recovers the payload in that case.
        raw = tag.string or tag.get_text()
        if not raw:
            continue
        try:
            data = json.loads(raw)
        except Exception:
            continue  # skip malformed blobs rather than abort the page
        # Normalize the top level to a list of candidate objects.
        candidates = data if isinstance(data, list) else [data]
        for obj in candidates:
            if not isinstance(obj, dict):
                continue
            graph = obj.get("@graph")
            nodes = graph if isinstance(graph, list) else [obj]
            for node in nodes:
                if not isinstance(node, dict):
                    continue
                t = node.get("@type")
                types = t if isinstance(t, list) else [t]
                if "Product" in types or "ProductGroup" in types:
                    return node
    return None
def parse_product_page(html: str, url: str) -> dict:
    """Parse a Walmart product page into a flat dict.

    Strategy: JSON-LD first (title / price / currency / availability),
    then best-effort HTML fallbacks (h1 for title, price meta tags).
    The result always includes "url"; other keys are set only when found.
    """
    soup = BeautifulSoup(html, "lxml")
    out = {"url": url}
    j = parse_jsonld_product(soup)
    if j:
        out["title"] = j.get("name")
        offers = j.get("offers")
        # JSON-LD "offers" may be a single object or a list of offer
        # objects; the old code silently dropped the price in the list
        # case. Use the first offer when given a list.
        if isinstance(offers, list) and offers:
            offers = offers[0]
        if isinstance(offers, dict):
            out["price"] = offers.get("price")
            out["currency"] = offers.get("priceCurrency")
            out["availability"] = offers.get("availability")
    # Fallbacks when JSON-LD was missing or incomplete.
    if not out.get("title"):
        h1 = soup.select_one("h1")
        if h1:
            out["title"] = h1.get_text(" ", strip=True)
    if not out.get("price"):
        # Price sometimes appears in meta tags.
        meta = soup.select_one('meta[property="product:price:amount"], meta[itemprop="price"]')
        if meta and meta.get("content"):
            out["price"] = meta.get("content")
    return out
if __name__ == "__main__":
    # Demo: fetch one known product page and print the parsed record.
    url = "https://www.walmart.com/ip/Great-Value-Old-Fashioned-Oats-42-oz/10450985"
    page_html = get(url)
    record = parse_product_page(page_html, url)
    print(record)
Step 4: End-to-end crawl (search → top N products → details)
import time


def crawl(query: str, limit: int = 10, sleep_s: float = 1.0) -> list[dict]:
    """Search *query*, then fetch and parse the top *limit* product pages.

    Failures on individual products are printed and skipped rather than
    aborting the crawl; a *sleep_s* pause between requests keeps the crawl
    polite. Each successful record is tagged with the originating query.
    """
    listing_html = get(build_search_url(query))
    cards = parse_search_results(listing_html, limit=limit)
    rows: list[dict] = []
    total = len(cards)
    for idx, card in enumerate(cards, start=1):
        product_url = card["url"]
        try:
            detail = parse_product_page(get(product_url), product_url)
            detail["query"] = query
            rows.append(detail)
            print(f"[{idx}/{total}] ok", product_url)
        except Exception as e:
            print(f"[{idx}/{total}] fail", product_url, e)
        time.sleep(sleep_s)
    return rows
if __name__ == "__main__":
    # Demo: end-to-end crawl for "milk", capped at 8 products.
    records = crawl("milk", limit=8)
    print("rows:", len(records))
Export JSON
import json
def dump_json(path: str, rows: list[dict]) -> None:
    """Write *rows* to *path* as pretty-printed UTF-8 JSON (non-ASCII kept)."""
    payload = json.dumps(rows, ensure_ascii=False, indent=2)
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(payload)
# rows = crawl("oats", limit=10)
# dump_json("walmart_grocery.json", rows)
Practical anti-block checklist
- Use timeouts + retries (already done)
- Add a small delay between requests (1–2s)
- Rotate IPs when you scale (ProxiesAPI)
- Keep headers consistent (UA, language)
- Cache product pages you’ve already crawled
Where ProxiesAPI fits (honestly)
For Walmart, the failure modes usually show up once you:
- crawl many queries (hundreds of searches)
- revisit products frequently
- run from a single IP / data center
ProxiesAPI helps you reduce repetitive traffic signatures by rotating IPs and gives you a centralized, consistent proxy config.
It won’t replace:
- good parsing strategy (JSON-LD first)
- a plan for JS-heavy pages (API endpoints or headless browser)
QA checklist
- Search returns some /ip/ product links
- Product page includes JSON-LD for at least some items
- Price extracted is non-empty for multiple products
- Export JSON loads cleanly
Walmart pages are high-traffic and change often. ProxiesAPI helps you reduce blocks and keep retries + rotation consistent when you scale from a few SKUs to thousands.