Scrape Recipe Data from AllRecipes with Python (Ingredients + Steps + Nutrition)
AllRecipes recipe pages usually contain a goldmine: a JSON-LD “Recipe” object with ingredients, steps, and nutrition.
So instead of guessing brittle CSS selectors, we’ll:
- fetch the HTML
- find <script type="application/ld+json"> blocks
- parse the Recipe object
- normalize it into clean rows you can store/search

Recipes look simple, but large crawls fail on timeouts, flaky pages, and throttles. ProxiesAPI belongs in the fetch layer so your JSON-LD parsing stays the same while you scale.
Setup
python3 -m venv .venv
source .venv/bin/activate
pip install requests beautifulsoup4 lxml pandas
export PROXIESAPI_KEY="YOUR_KEY" # optional
Step 1: A clean fetch layer (optional ProxiesAPI)
Keep your scraper architecture clean:
fetch HTML → parse JSON-LD → export
import os
import time
import random
import urllib.parse
import requests
# ProxiesAPI auth key, read once from the environment. An empty string means
# no key is configured.
PROXIESAPI_KEY = os.environ.get("PROXIESAPI_KEY", "")
# Timeout handed to requests as a (connect, read) pair, in seconds.
TIMEOUT = (10, 40)  # connect, read
# Single requests.Session reused for every HTTP call made by fetch_html.
session = requests.Session()
def proxiesapi_url(target_url: str) -> str:
    """Build the ProxiesAPI request URL that fetches ``target_url``.

    Both the auth key and the target URL are percent-encoded with no
    characters treated as safe, so they can be embedded as query values.

    Raises:
        RuntimeError: If ``PROXIESAPI_KEY`` is empty.
    """
    if PROXIESAPI_KEY:
        encode = urllib.parse.quote
        key_param = encode(PROXIESAPI_KEY, safe="")
        url_param = encode(target_url, safe="")
        return f"http://api.proxiesapi.com/?auth_key={key_param}&url={url_param}"
    raise RuntimeError("Set PROXIESAPI_KEY in your environment")
def fetch_html(url: str, *, use_proxiesapi: bool = True, max_retries: int = 4) -> str:
    """Download ``url`` and return its HTML, retrying failed attempts.

    The request goes through ProxiesAPI only when ``use_proxiesapi`` is true
    and ``PROXIESAPI_KEY`` is set; otherwise ``url`` is fetched directly.

    Args:
        url: Page to download.
        use_proxiesapi: Route the request through ProxiesAPI when a key is set.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The response body decoded as text.

    Raises:
        RuntimeError: If every attempt failed (network error, HTTP error
            status, or a body shorter than 2000 characters). The last
            underlying error is attached as ``__cause__``.
    """
    # The request URL and headers are identical for every attempt, so build
    # them once instead of on each loop iteration.
    final_url = proxiesapi_url(url) if (use_proxiesapi and PROXIESAPI_KEY) else url
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/123.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
    }

    last_err = None
    for attempt in range(1, max_retries + 1):
        try:
            resp = session.get(final_url, timeout=TIMEOUT, headers=headers)
            resp.raise_for_status()
            html = resp.text
            # A very short body is treated as a failed fetch and retried.
            if not html or len(html) < 2000:
                raise RuntimeError(f"Suspiciously small HTML ({len(html)} bytes)")
            return html
        except Exception as e:
            # Deliberately broad: any failure of an attempt is retryable, and
            # the caller only ever sees the RuntimeError raised below.
            last_err = e
        # Exponential backoff (capped at 10s) plus jitter, but only when
        # another attempt will follow; sleeping before giving up is wasted time.
        if attempt < max_retries:
            time.sleep(min(10, 2 ** (attempt - 1)) + random.random())
    raise RuntimeError(
        f"HTML fetch failed after {max_retries} attempts: {last_err}"
    ) from last_err
Step 2: Parse the Recipe JSON-LD
AllRecipes often includes multiple JSON-LD blocks (BreadcrumbList, WebPage, Recipe). We’ll scan for the one that contains a Recipe object.
import json
from bs4 import BeautifulSoup
def iter_json_ld(html: str) -> list[dict]:
    """Return every JSON-LD object found in ``html``.

    Each ``<script type="application/ld+json">`` element is parsed as JSON.
    A block holding a single object contributes that object; a block holding
    an array contributes each of its dict items. Empty blocks and blocks that
    are not valid JSON are skipped.

    Args:
        html: Raw HTML of the page.

    Returns:
        The collected dicts, in document order.
    """
    soup = BeautifulSoup(html, "lxml")
    out: list[dict] = []
    for script in soup.select('script[type="application/ld+json"]'):
        raw = (script.string or "").strip()
        if not raw:
            continue
        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            # Malformed block: skip it rather than failing the whole page.
            continue
        if isinstance(data, dict):
            out.append(data)
        elif isinstance(data, list):
            out.extend(x for x in data if isinstance(x, dict))
    return out
def find_recipe_obj(objs: list[dict]) -> dict | None:
    """Return the first schema.org Recipe node among ``objs``.

    Each object is checked itself first, then the nodes under its ``@graph``
    key. ``@graph`` may be an array of nodes or a single node object.

    Args:
        objs: Parsed JSON-LD objects, e.g. the output of ``iter_json_ld``.

    Returns:
        The matching dict (the same object, not a copy), or ``None`` when no
        Recipe node is present.
    """
    # "@type" is normally the bare term, but the full IRI form is valid too.
    recipe_types = (
        "Recipe",
        "http://schema.org/Recipe",
        "https://schema.org/Recipe",
    )

    def is_recipe(d: dict) -> bool:
        t = d.get("@type")
        candidates = t if isinstance(t, list) else [t]
        # The isinstance guard keeps non-string entries from matching.
        return any(isinstance(x, str) and x in recipe_types for x in candidates)

    for o in objs:
        if not isinstance(o, dict):
            continue
        if is_recipe(o):
            return o
        # Sometimes the recipe is nested under @graph.
        g = o.get("@graph")
        if isinstance(g, dict):
            g = [g]
        if isinstance(g, list):
            for node in g:
                if isinstance(node, dict) and is_recipe(node):
                    return node
    return None
Step 3: Normalize fields (ingredients, steps, nutrition)
Recipe JSON-LD can vary slightly. Normalize defensively.
from typing import Any
def _collect_steps(node: Any, out: list[str]) -> None:
    """Append every non-empty step text found under ``node`` to ``out``."""
    if isinstance(node, str):
        text = node.strip()
        if text:
            out.append(text)
    elif isinstance(node, list):
        for item in node:
            _collect_steps(item, out)
    elif isinstance(node, dict):
        # HowToStep carries "text"; HowToSection carries "itemListElement".
        # A string "text" wins when both are present.
        text = node.get("text")
        if isinstance(text, str):
            _collect_steps(text, out)
        else:
            _collect_steps(node.get("itemListElement"), out)


def normalize_instructions(val: Any) -> list[str]:
    """Flatten a ``recipeInstructions`` value into a list of step strings.

    Accepts a plain string, a dict with ``text`` or ``itemListElement``, or a
    list mixing strings and such dicts, nested to any depth. Step texts are
    stripped, and empty ones are dropped.

    Args:
        val: The raw ``recipeInstructions`` value; may be ``None``.

    Returns:
        Step texts in document order. Empty list when ``val`` is missing or
        has an unsupported type.
    """
    steps: list[str] = []
    _collect_steps(val, steps)
    return steps
def norm_recipe(recipe: dict) -> dict:
    """Flatten a Recipe JSON-LD object into one row of plain fields.

    Args:
        recipe: A Recipe node as parsed from JSON-LD.

    Returns:
        A dict with the keys ``name``, ``description``, ``url``, ``author``,
        ``prep_time``, ``cook_time``, ``total_time``, ``recipe_yield``,
        ``category``, ``cuisine``, ``ingredients``, ``instructions``,
        ``calories``, ``fat``, ``carbs``, ``protein``, ``rating_value`` and
        ``rating_count``. Fields absent from ``recipe`` come back as ``None``,
        except ``ingredients`` and ``instructions`` which are always lists.
    """
    # Nested objects that are missing or malformed behave like empty dicts so
    # the lookups below yield None.
    nutrition = recipe.get("nutrition")
    if not isinstance(nutrition, dict):
        nutrition = {}
    rating = recipe.get("aggregateRating")
    if not isinstance(rating, dict):
        rating = {}

    # "author" may be a Person object, a list of them, or a bare string.
    author = recipe.get("author")
    if isinstance(author, dict):
        author = author.get("name")
    elif isinstance(author, list):
        names = [a.get("name") if isinstance(a, dict) else a for a in author]
        author = ", ".join(n for n in names if isinstance(n, str) and n) or None

    # A single ingredient given as a string is still reported as a list.
    ingredients = recipe.get("recipeIngredient") or []
    if isinstance(ingredients, str):
        ingredients = [ingredients]

    return {
        "name": recipe.get("name"),
        "description": recipe.get("description"),
        "url": recipe.get("url"),
        "author": author,
        "prep_time": recipe.get("prepTime"),
        "cook_time": recipe.get("cookTime"),
        "total_time": recipe.get("totalTime"),
        "recipe_yield": recipe.get("recipeYield"),
        "category": recipe.get("recipeCategory"),
        "cuisine": recipe.get("recipeCuisine"),
        "ingredients": ingredients,
        "instructions": normalize_instructions(recipe.get("recipeInstructions")),
        "calories": nutrition.get("calories"),
        "fat": nutrition.get("fatContent"),
        "carbs": nutrition.get("carbohydrateContent"),
        "protein": nutrition.get("proteinContent"),
        "rating_value": rating.get("ratingValue"),
        "rating_count": rating.get("ratingCount"),
    }
Step 4: Crawl a list of recipe URLs and export
You can start with a hand-picked list of URLs. Later, add a discovery step (search/category pages) to build your URL queue.
import pandas as pd
def scrape_recipe(url: str, *, use_proxiesapi: bool = True) -> dict:
    """Fetch one recipe page and return its normalized row.

    The page is downloaded, its JSON-LD blocks are scanned for a Recipe
    object, and that object is flattened. The requested ``url`` is recorded
    under ``source_url``.

    Raises:
        RuntimeError: If the page contains no Recipe JSON-LD object.
    """
    page = fetch_html(url, use_proxiesapi=use_proxiesapi)
    found = find_recipe_obj(iter_json_ld(page))
    if not found:
        raise RuntimeError("Recipe JSON-LD not found")
    return {**norm_recipe(found), "source_url": url}
def export(recipes: list[dict], slug: str) -> None:
    """Write ``recipes`` to ``<slug>.json`` and ``<slug>.csv``.

    The JSON file is a list of records, indented and with non-ASCII
    characters kept as-is; the CSV file has no index column. One line per
    written file is printed with its name and the number of rows.
    """
    json_path = f"{slug}.json"
    csv_path = f"{slug}.csv"
    frame = pd.DataFrame(recipes)
    frame.to_json(json_path, orient="records", indent=2, force_ascii=False)
    frame.to_csv(csv_path, index=False)
    for path in (json_path, csv_path):
        print("wrote", path, len(frame))
Example run:
# Recipe pages to scrape; replace the placeholders with real URLs.
URLS = [
    "PASTE_ALLRECIPES_RECIPE_URL_1",
    "PASTE_ALLRECIPES_RECIPE_URL_2",
]

rows = []
for u in URLS:
    try:
        rows.append(scrape_recipe(u, use_proxiesapi=True))
    except RuntimeError as err:
        # A page that cannot be fetched or has no Recipe JSON-LD is skipped,
        # so one bad URL does not discard the rows already collected.
        print("skipped", u, err)
export(rows, slug="allrecipes_recipes")
QA checklist
- recipeIngredient is non-empty for 3–5 sample URLs
- Instructions are parsed as a clean list of steps
- Nutrition fields are present (when the page provides them)
- Exported CSV opens cleanly and values look sane
Where ProxiesAPI fits (no hype)
AllRecipes is often parseable without proxies for small runs.
ProxiesAPI matters when you scale: lots of URLs, more failures, more retries. Keeping it in the fetch layer means your JSON-LD parsing logic stays unchanged.
Recipes look simple, but large crawls fail on timeouts, flaky pages, and throttles. ProxiesAPI belongs in the fetch layer so your JSON-LD parsing stays the same while you scale.