Scrape Stack Overflow Newest Questions into CSV with Python
Stack Overflow's newest questions feed is a practical source of developer-intent data. It tells you what engineers are stuck on right now, which frameworks are throwing fresh errors, and which tags are suddenly getting active.
In this tutorial we will build a Python scraper that:
- fetches the public newest-questions listing
- extracts question title, URL, votes, answers, views, tags, and asked time
- paginates across multiple pages
- exports the results to CSV
- keeps the network layer ready for ProxiesAPI if you scale the crawl

Newest-question feeds are easy to scrape in small batches. Once you monitor many tags or archive questions continuously, ProxiesAPI helps keep the fetch layer predictable without changing your parser.
Target page and selectors
The listing we want is:
https://stackoverflow.com/questions?tab=Newest
Stack Overflow still renders this page as readable HTML, which means requests + BeautifulSoup is enough.
At the time of writing, each question card is a div.s-post-summary. Useful selectors inside each card are:
- h3 a.s-link for the title and detail URL
- span.s-post-summary--stats-item-number for votes, answers, and views
- a.post-tag for tags
- span.relativetime or time for the asked timestamp
That is exactly what we need for a lightweight monitoring dataset.
Setup
python3 -m venv .venv
source .venv/bin/activate
pip install requests beautifulsoup4 lxml
Optional:
export PROXIESAPI_KEY="YOUR_KEY"
Step 1: Build a polite HTTP client
You want real timeouts, a browser-like User-Agent, and exponential backoff around transient errors.
from __future__ import annotations
import os
import random
import time
from urllib.parse import quote
import requests
# Site root; relative question links are resolved against this.
BASE = "https://stackoverflow.com"

# Optional ProxiesAPI key taken from the environment (empty string when unset).
PROXIESAPI_KEY = os.environ.get("PROXIESAPI_KEY", "").strip()

# Passed straight to requests as ``timeout=``; requests reads a 2-tuple as
# (connect timeout, read timeout) in seconds.
TIMEOUT = (10, 30)

# Default headers installed on the shared session: a browser-like User-Agent
# plus an English language preference.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/126.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}
class HttpClient:
    """Thin wrapper around a ``requests.Session`` with timeouts and retries.

    The session carries the module-level ``HEADERS`` on every request. Fetches
    can optionally be routed through ProxiesAPI when ``PROXIESAPI_KEY`` is set.
    """

    # Statuses treated as transient and therefore worth another attempt.
    # 403 is included because block pages are frequently temporary.
    _RETRY_STATUSES = frozenset({403, 429, 500, 502, 503, 504})

    def __init__(self) -> None:
        self.session = requests.Session()
        self.session.headers.update(HEADERS)

    def _url(self, target_url: str, use_proxiesapi: bool) -> str:
        """Return the URL to actually request for ``target_url``.

        The target is returned unchanged unless ``use_proxiesapi`` is true and
        a key is configured; in that case the target is percent-encoded into
        the ProxiesAPI endpoint's ``url`` query parameter.
        """
        if not use_proxiesapi or not PROXIESAPI_KEY:
            return target_url
        return (
            "http://api.proxiesapi.com/?key="
            f"{quote(PROXIESAPI_KEY, safe='')}&url={quote(target_url, safe='')}"
        )

    def get_html(self, target_url: str, *, use_proxiesapi: bool = False, retries: int = 4) -> str:
        """Fetch ``target_url`` and return the response body as text.

        Network errors and the statuses in ``_RETRY_STATUSES`` are retried with
        capped exponential backoff plus jitter. Any other 4xx/5xx status (for
        example 404) fails immediately, because repeating the request cannot
        change the outcome.

        Args:
            target_url: Page to download.
            use_proxiesapi: Route the request through ProxiesAPI when a key is
                configured.
            retries: Maximum number of attempts.

        Returns:
            The decoded response body.

        Raises:
            RuntimeError: If every attempt failed or the server answered with
                a non-retryable error status. The underlying exception is
                attached as ``__cause__``.
        """
        request_url = self._url(target_url, use_proxiesapi=use_proxiesapi)
        last_error: Exception | None = None

        for attempt in range(1, retries + 1):
            try:
                response = self.session.get(request_url, timeout=TIMEOUT)
            except requests.RequestException as exc:
                # Connection resets, DNS failures, timeouts: worth retrying.
                last_error = exc
            else:
                status = response.status_code
                if status < 400:
                    return response.text
                if status in self._RETRY_STATUSES:
                    last_error = requests.HTTPError(
                        f"transient status {status}",
                        response=response,
                    )
                else:
                    # Permanent client/server error: stop instead of retrying.
                    last_error = requests.HTTPError(
                        f"status {status}",
                        response=response,
                    )
                    break

            # Back off only when another attempt will actually follow.
            if attempt < retries:
                time.sleep(min(2 ** attempt, 8) + random.random())

        raise RuntimeError(f"failed to fetch {target_url}: {last_error}") from last_error
Step 2: Parse one page of newest questions
We will convert each question card into a flat record that is easy to write to CSV.
from dataclasses import asdict, dataclass
import re
from urllib.parse import urljoin
from bs4 import BeautifulSoup
def parse_number(text: str) -> int | None:
match = re.search(r"(\d[\d,]*)", text or "")
if not match:
return None
return int(match.group(1).replace(",", ""))
@dataclass
class QuestionRow:
title: str
url: str
votes: int | None
answers: int | None
views: int | None
asked_at: str | None
tags: str
def _extract_asked_at(card) -> str | None:
    """Return the most precise asked-time string available on ``card``.

    Preference order: a machine-readable ``datetime`` attribute on a
    ``<time>`` element, then the ``title`` attribute of ``span.relativetime``,
    and only as a last resort the visible text.
    """
    # NOTE(review): at the time of writing the listing appears to wrap the
    # relative text ("asked 2 mins ago") in a <time> element without a
    # datetime attribute, and to keep the absolute timestamp in the inner
    # span's title. Confirm against the live markup. Relative text is of
    # little use in an archive.
    for selector, attr in (
        ("time[datetime]", "datetime"),
        ("span.relativetime[title]", "title"),
    ):
        element = card.select_one(selector)
        if element is not None and element.get(attr):
            return element.get(attr)

    fallback = card.select_one("time") or card.select_one("span.relativetime")
    if fallback is None:
        return None
    return fallback.get_text(" ", strip=True)


def parse_listing(html: str) -> list[QuestionRow]:
    """Parse one listing page into ``QuestionRow`` records.

    Args:
        html: Raw HTML of a questions listing page.

    Returns:
        One row per ``div.s-post-summary`` card that has a title link, in
        document order. Cards without a title link are skipped.
    """
    soup = BeautifulSoup(html, "lxml")
    rows: list[QuestionRow] = []

    for card in soup.select("div.s-post-summary"):
        title_link = card.select_one("h3 a.s-link")
        if title_link is None:
            continue

        # Stats are read positionally: votes, answers, views.
        stats = [
            el.get_text(" ", strip=True)
            for el in card.select("span.s-post-summary--stats-item-number")
        ]
        votes = parse_number(stats[0]) if len(stats) > 0 else None
        answers = parse_number(stats[1]) if len(stats) > 1 else None
        views = parse_number(stats[2]) if len(stats) > 2 else None

        tags = [tag.get_text(strip=True) for tag in card.select("a.post-tag")]

        rows.append(
            QuestionRow(
                title=title_link.get_text(" ", strip=True),
                # hrefs on the listing are site-relative; make them absolute.
                url=urljoin(BASE, title_link.get("href", "")),
                votes=votes,
                answers=answers,
                views=views,
                asked_at=_extract_asked_at(card),
                tags="|".join(tags),
            )
        )
    return rows
Quick test:
# Smoke test: fetch the first listing page and show what the parser found.
client = HttpClient()
html = client.get_html("https://stackoverflow.com/questions?tab=Newest&page=1")
questions = parse_listing(html)
print("questions:", len(questions))
# Zero cards usually means the markup changed or a block page was served;
# report that instead of crashing on questions[0].
if questions:
    print(asdict(questions[0]))
else:
    print("no question cards matched; check the selectors or the fetched HTML")
Step 3: Paginate across multiple pages
If your goal is ongoing monitoring, one page is usually not enough. Paginating is straightforward because Stack Overflow exposes a page= query parameter.
from urllib.parse import urlencode
def newest_url(page: int = 1, pagesize: int = 50) -> str:
    """Build the URL of one page of the newest-questions listing.

    Args:
        page: Page number placed in the ``page`` query parameter.
        pagesize: Value placed in the ``pagesize`` query parameter.

    Returns:
        The listing URL under ``BASE`` with ``tab=Newest`` and the paging
        parameters encoded in the query string.
    """
    query = urlencode({"tab": "Newest", "page": page, "pagesize": pagesize})
    return f"{BASE}/questions?{query}"
def crawl_newest(
    *,
    pages: int = 3,
    pagesize: int = 50,
    use_proxiesapi: bool = False,
    delay: float = 1.0,
) -> list[QuestionRow]:
    """Crawl consecutive listing pages and return de-duplicated rows.

    Args:
        pages: Maximum number of listing pages to fetch, starting at page 1.
        pagesize: Questions requested per page.
        use_proxiesapi: Forwarded to ``HttpClient.get_html`` so the crawl can
            be routed through ProxiesAPI without touching the parser.
        delay: Seconds to pause between page requests.

    Returns:
        Rows in crawl order, with each question URL appearing at most once.
    """
    client = HttpClient()
    seen_urls: set[str] = set()
    all_rows: list[QuestionRow] = []

    for page in range(1, pages + 1):
        html = client.get_html(
            newest_url(page=page, pagesize=pagesize),
            use_proxiesapi=use_proxiesapi,
        )
        batch = parse_listing(html)

        for row in batch:
            # The same question can show up on two consecutive pages.
            if row.url in seen_urls:
                continue
            seen_urls.add(row.url)
            all_rows.append(row)

        print(f"page={page} batch={len(batch)} total={len(all_rows)}")

        # An empty page means the listing ended or the markup no longer
        # matches; further requests would only add load.
        if not batch:
            break
        # Be polite between requests, but do not stall after the final page.
        if page < pages:
            time.sleep(delay)

    return all_rows
The seen_urls set matters because consecutive pages can overlap: when new questions arrive between two requests, older questions shift down and reappear on the next page.
Step 4: Export to CSV
CSV keeps the output easy to inspect in Excel, Google Sheets, or a BI import.
import csv
from pathlib import Path
def save_csv(rows: list[QuestionRow], path: str = "stack_overflow_newest.csv") -> Path:
    """Write ``rows`` to a UTF-8 CSV file and return its path.

    The header is derived from the dataclass fields rather than from the
    first row, so an empty ``rows`` list produces a header-only file instead
    of raising ``IndexError``.

    Args:
        rows: Records to export.
        path: Destination file; missing parent directories are created.

    Returns:
        The destination as a ``Path``.
    """
    # Local import: only this function needs ``fields``.
    from dataclasses import fields

    output = Path(path)
    output.parent.mkdir(parents=True, exist_ok=True)

    # Column order follows the dataclass definition, the same order that
    # ``asdict`` produces.
    template = rows[0] if rows else QuestionRow
    fieldnames = [field.name for field in fields(template)]

    # newline="" lets the csv module control line endings itself.
    with output.open("w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(asdict(row) for row in rows)
    return output
Full run:
def main() -> None:
    """Crawl two pages of newest questions and export them to CSV."""
    rows = crawl_newest(pages=2, pagesize=30)
    if not rows:
        # Nothing parsed (blocked request or changed markup): report it
        # rather than failing on rows[0] below.
        print("no questions parsed; nothing to save")
        return

    csv_path = save_csv(rows, "data/stack-overflow-newest.csv")
    print(f"saved {len(rows)} questions to {csv_path}")
    print(asdict(rows[0]))


if __name__ == "__main__":
    main()
Why this dataset is useful
A raw archive of newest questions is useful for:
- spotting fresh error messages before keyword tools catch up
- tracking tag velocity for specific ecosystems
- building lead indicators for docs, tutorials, or support content
- detecting which frameworks are generating the most unanswered confusion
The feed is especially helpful when you filter later by tag, for example python, pandas, playwright, or next.js.
Adding ProxiesAPI without changing the parser
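For a small crawl, direct requests are usually enough. Once you run this continuously, route only the fetch layer through ProxiesAPI and leave the parsing logic alone. With the client above, that means setting PROXIESAPI_KEY and calling client.get_html(url, use_proxiesapi=True).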
That separation pays off because:
| Concern | Lives in the fetch layer | Lives in the parser |
|---|---|---|
| retries and timeouts | yes | no |
| proxy routing | yes | no |
| CSS selectors | no | yes |
| CSV field mapping | no | yes |
This is the simplest way to avoid turning a working scraper into a tangled script.
Practical extensions
- Add a tag filter and scrape only questions that match a shortlist you care about.
- Follow each question URL and capture accepted answers in a second-stage job.
- Store a hash of title + URL so you can run hourly without duplicates.
- Send rows into SQLite or Postgres if CSV becomes too limiting.
If your only goal is "newest questions into CSV," the code above is enough. The next layer is analysis and alerting, not more scraping complexity.
Newest-question feeds are easy to scrape in small batches. Once you monitor many tags or archive questions continuously, ProxiesAPI helps keep the fetch layer predictable without changing your parser.