How to Download Images from URLs with Python (fast, reliable, and deduped)
Downloading images from a list of URLs sounds easy… until you do it at scale.
You start running into:
- timeouts and flaky connections
- random HTML error pages saved as “.jpg”
- duplicate images under different URLs
- rate limits (429) and blocks (403)
This guide shows a production-grade approach to the keyword:
how to download images from urls with python
We’ll build a downloader that:
- streams images to disk (no huge RAM usage)
- validates content type + magic bytes
- uses retries with backoff
- downloads concurrently (ThreadPool)
- dedupes via SHA-256
- optionally routes through a ProxiesAPI proxy
Bulk image downloads trigger throttling fast. ProxiesAPI gives you a proxy endpoint you can plug into requests so your downloader keeps moving even when a host starts returning 429/403.
Setup
python -m venv .venv
source .venv/bin/activate
pip install requests tenacity
We’ll use:
- requests — for streaming downloads
- tenacity — for retry/backoff
Core design (what “reliable” means)
A reliable downloader does these things:
- Timeouts on every request
- Retries for transient failures (5xx, timeouts)
- Streaming writes (iter_content) so big images don't blow RAM
- Validation so you don't save HTML as an image
- Deterministic filenames so reruns don’t create chaos
- Dedupe so you don’t store the same image 10×
ProxiesAPI support (optional)
If a host starts rate limiting (429) or blocking (403), a proxy can help.
We’ll support a proxy via environment variable:
export PROXIESAPI_PROXY_URL="http://USER:PASS@gateway.proxiesapi.com:PORT"
The downloader (complete code)
import hashlib
import mimetypes
import os
import re
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse
import requests
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
# (connect, read) timeouts in seconds — applied to every request so no
# download can hang forever.
TIMEOUT = (10, 60)
# Optional ProxiesAPI gateway URL; when unset, requests go direct.
PROXIESAPI_PROXY_URL = os.getenv("PROXIESAPI_PROXY_URL")
# One shared Session reuses TCP connections across all worker threads.
session = requests.Session()
# Browser-like headers; some image hosts reject the default python-requests UA.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    ),
    "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
}
def proxies_dict():
    """Return a requests-style proxies mapping, or None when no proxy is configured."""
    if PROXIESAPI_PROXY_URL:
        return {"http": PROXIESAPI_PROXY_URL, "https": PROXIESAPI_PROXY_URL}
    return None
def safe_filename(s: str, max_len: int = 120) -> str:
    """Sanitize *s* for use as a filename.

    Runs of characters outside [a-zA-Z0-9._-] collapse to a single '-',
    leading/trailing '-' are trimmed, and the result is capped at *max_len*.
    """
    cleaned = re.sub(r"[^a-zA-Z0-9._-]+", "-", s).strip("-")
    return cleaned[:max_len]
def guess_ext(content_type: str | None, url: str) -> str:
if content_type:
ct = content_type.split(";")[0].strip().lower()
ext = mimetypes.guess_extension(ct) or ""
if ext in {".jpe", ".jpeg", ".jpg", ".png", ".webp", ".gif", ".bmp", ".tif", ".tiff"}:
return ".jpg" if ext == ".jpeg" else ext
# fallback: parse URL path
path = urlparse(url).path
_, ext = os.path.splitext(path)
ext = ext.lower()
if ext in {".jpg", ".jpeg", ".png", ".webp", ".gif"}:
return ".jpg" if ext == ".jpeg" else ext
return ".jpg"
def is_probably_image(content_type: str | None) -> bool:
if not content_type:
return False
return content_type.lower().startswith("image/")
def sha256_file(path: str) -> str:
    """Return the hex SHA-256 digest of the file at *path*, read in 1 MiB chunks."""
    digest = hashlib.sha256()
    with open(path, "rb") as fh:
        while chunk := fh.read(1024 * 1024):
            digest.update(chunk)
    return digest.hexdigest()
_lock = threading.Lock()
@retry(
    reraise=True,
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=2, max=20),
    retry=retry_if_exception_type((requests.RequestException,)),
)
def download_one(url: str, out_dir: str) -> dict:
    """Download one image URL into *out_dir*, deduped by content hash.

    Streams the body to a temp file while hashing it, then renames the
    temp file to "<sha256><ext>". Returns a dict with url/path/bytes/
    sha256/deduped. Raises requests.RequestException (retried by tenacity)
    on transport errors and throttling statuses, ValueError (not retried)
    on non-image responses.

    Fixes vs. the naive version:
    - the temp name includes the thread id, so two threads fetching
      different URLs that share a basename no longer write the same
      ".partial" file concurrently and corrupt each other;
    - a failure mid-stream removes the partial file instead of leaking it;
    - the response is always closed, including on the ValueError path.
    """
    os.makedirs(out_dir, exist_ok=True)
    r = session.get(
        url,
        headers=HEADERS,
        timeout=TIMEOUT,
        stream=True,
        proxies=proxies_dict(),
    )
    try:
        # Treat rate limits / blocks / transient server errors as
        # retryable failures (tenacity retries RequestException).
        if r.status_code in (403, 429, 500, 502, 503, 504):
            raise requests.RequestException(f"HTTP {r.status_code} for {url}")
        r.raise_for_status()
        content_type = r.headers.get("Content-Type", "")
        if not is_probably_image(content_type):
            # do not save HTML error pages
            raise ValueError(f"Non-image content-type '{content_type}' for {url}")
        ext = guess_ext(content_type, url)
        parsed = urlparse(url)
        base = safe_filename(os.path.basename(parsed.path) or "image") or "image"
        # Thread-unique temp name: one thread handles one URL at a time,
        # so no two in-flight downloads can collide on this path.
        tmp_path = os.path.join(
            out_dir, f"{base}.{threading.get_ident()}.partial{ext}"
        )
        h = hashlib.sha256()
        size = 0
        try:
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024 * 64):
                    if not chunk:
                        continue
                    f.write(chunk)
                    h.update(chunk)
                    size += len(chunk)
            digest = h.hexdigest()
            final_path = os.path.join(out_dir, f"{digest}{ext}")
            # Atomic-ish move: if already downloaded, discard temp
            with _lock:
                if os.path.exists(final_path):
                    os.remove(tmp_path)
                    return {"url": url, "path": final_path, "bytes": size, "sha256": digest, "deduped": True}
                os.replace(tmp_path, final_path)
            return {"url": url, "path": final_path, "bytes": size, "sha256": digest, "deduped": False}
        except Exception:
            # Don't leave partial files behind when the stream fails.
            try:
                os.remove(tmp_path)
            except OSError:
                pass
            raise
    finally:
        r.close()
def download_all(urls: list[str], out_dir: str = "images", workers: int = 12) -> list[dict]:
    """Download every URL in *urls* concurrently into *out_dir*.

    Returns one dict per URL: either a success record from download_one
    or {"url": ..., "error": ...} for failures. Progress is printed as
    each download completes.
    """
    outcomes: list[dict] = []
    with ThreadPoolExecutor(max_workers=workers) as pool:
        pending = {pool.submit(download_one, link, out_dir): link for link in urls}
        for done in as_completed(pending):
            link = pending[done]
            try:
                record = done.result()
            except Exception as exc:
                outcomes.append({"url": link, "error": str(exc)})
                print("ERR", link, str(exc))
            else:
                outcomes.append(record)
                print("OK", link, record["path"], "deduped=" + str(record["deduped"]))
    return outcomes
if __name__ == "__main__":
    # Demo run: fetch two sample images, then report success counts.
    urls = [
        # Replace with your list
        "https://httpbin.org/image/jpeg",
        "https://httpbin.org/image/png",
    ]
    results = download_all(urls, out_dir="downloaded_images", workers=8)
    succeeded = [r for r in results if "error" not in r]
    print("done", "ok=", len(succeeded), "total=", len(results))
Notes on dedupe (why SHA-256 is better than filenames)
Two URLs can point to the same bytes:
- image CDNs with different query strings
- resized variants that are actually identical
- “same file uploaded twice” content
By hashing the bytes, you dedupe based on content, not based on the URL.
If you want to keep the original filename too, store a mapping CSV:
url,sha256,path
Practical advice: don’t get blocked while downloading
- keep workers modest (8–16)
- add jitter between requests if needed
- respect robots/legal constraints
- rotate IPs if you’re pulling large batches (ProxiesAPI)
Comparison: requests vs aiohttp
| Feature | requests + threads | aiohttp |
|---|---|---|
| Simplicity | Easy | Medium |
| Good enough for 10k images | Yes | Yes |
| HTTP/2 | No | Sometimes |
| Backpressure | Manual | Better |
If you want the simplest reliable solution: requests + ThreadPool is great.
Where ProxiesAPI fits (honestly)
Most image downloads fail for boring reasons:
- network hiccups
- host throttling
- inconsistent blocks
ProxiesAPI helps when a host starts treating your downloader like abuse.
It won’t fix broken URLs — but it can make a large batch download much more stable when you’re operating at scale.
Bulk image downloads trigger throttling fast. ProxiesAPI gives you a proxy endpoint you can plug into requests so your downloader keeps moving even when a host starts returning 429/403.