Scrape Hacker News Show HN Posts into a Launch Monitor
Show HN is one of the fastest ways to spot new developer tools, indie launches, and early product positioning shifts.
The problem is that browsing it manually is noisy. Posts disappear down the page quickly, and if you want a durable feed you need to capture the metadata yourself.
In this tutorial we will scrape the Show HN page into a launch monitor that stores:
- title
- outbound URL
- launch domain
- points
- comment count
- author
- age
- Hacker News item URL
Mandatory screenshot of the page we are scraping:

Hacker News itself is simple. But launch monitoring often spreads into linked product sites, directories, and docs pages. ProxiesAPI helps when that wider fetch layer gets noisy.
What we are scraping
The Show HN list is here: https://news.ycombinator.com/show
The current HTML is simple and stable:
- each post row is tr.athing.submission
- the next row holds points, author, age, and comment count inside td.subtext
- the next page link is a.morelink[rel="next"]
That makes HN a good target for a lightweight monitor.
Setup
python3 -m venv .venv
source .venv/bin/activate
pip install requests beautifulsoup4 lxml pandas
Step 1: fetch the page
We will keep the fetch layer simple, but still production-safe: real timeout, user-agent, and optional ProxiesAPI wrapper.
import os
import random
import time
import urllib.parse
from urllib.parse import urlparse
import requests
# Root of the Hacker News site; relative links are resolved against it.
BASE = "https://news.ycombinator.com"

# Timeout tuple handed to every request made through ``session``.
TIMEOUT = (10, 30)

# ProxiesAPI key taken from the environment; empty string when unset.
PROXIESAPI_KEY = os.environ.get("PROXIESAPI_KEY", "")

# One shared session so every request carries the same default headers.
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (compatible; ProxiesAPI-Guides/1.0)"
session.headers["Accept-Language"] = "en-US,en;q=0.9"
def proxiesapi_url(target_url: str) -> str:
    """Build the ProxiesAPI request URL that fetches *target_url*.

    The API key and the target URL are each percent-encoded with no
    characters left unescaped, then placed in the ``auth_key`` and ``url``
    query parameters.

    Raises:
        RuntimeError: if ``PROXIESAPI_KEY`` is empty.
    """
    if PROXIESAPI_KEY:
        encoded_key = urllib.parse.quote(PROXIESAPI_KEY, safe="")
        encoded_target = urllib.parse.quote(target_url, safe="")
        pieces = [
            "http://api.proxiesapi.com/?auth_key=",
            encoded_key,
            "&url=",
            encoded_target,
        ]
        return "".join(pieces)
    raise RuntimeError("Set PROXIESAPI_KEY before using ProxiesAPI")
def fetch(url: str, *, use_proxiesapi: bool = False, max_retries: int = 4) -> str:
    """Download a Show HN listing page and return its HTML.

    Args:
        url: Page to fetch.
        use_proxiesapi: When true, route the request through the URL built
            by ``proxiesapi_url`` instead of hitting *url* directly.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The response body, guaranteed to contain the submission-row marker.

    Raises:
        RuntimeError: if every attempt fails (the last error is chained as
            the cause), or if ProxiesAPI is requested without a key.
    """
    final_url = proxiesapi_url(url) if use_proxiesapi else url
    last_err = None
    for attempt in range(1, max_retries + 1):
        try:
            response = session.get(final_url, timeout=TIMEOUT)
            response.raise_for_status()
            html = response.text or ""
            # A successful status does not guarantee the listing markup is
            # present; a page without submission rows counts as a failure.
            if "athing submission" not in html:
                raise RuntimeError("Expected Show HN rows not found")
            return html
        except Exception as exc:  # deliberate retry boundary: any failure is retried
            last_err = exc
        # Back off 1s, 2s, 4s, ... capped at 8s, plus up to 1s of jitter.
        # Skip the wait after the final attempt: nothing follows it.
        if attempt < max_retries:
            time.sleep(min(8, 2 ** (attempt - 1)) + random.random())
    raise RuntimeError(f"Fetch failed after retries: {last_err}") from last_err
Step 2: parse one Show HN entry
Each post is split over two table rows:
- the submission row holds the title and outbound link
- the following subtext row holds points, author, age, and comments
import re
from urllib.parse import urljoin
from bs4 import BeautifulSoup
def parse_int(text: str) -> int | None:
match = re.search(r"(\\d+)", text or "")
return int(match.group(1)) if match else None
def extract_domain(url: str | None) -> str | None:
if not url:
return None
return urlparse(url).netloc or None
def parse_show_row(row) -> dict | None:
    """Turn one submission row (plus its following subtext row) into a dict.

    Args:
        row: The ``tr.athing.submission`` element for a post.

    Returns:
        A dict with keys ``id``, ``title``, ``outbound_url``, ``domain``,
        ``points``, ``author``, ``age``, ``comments`` and ``item_url``, or
        ``None`` when the row has no title link.
    """
    title_a = row.select_one("span.titleline > a")
    if not title_a:
        return None

    item_id = row.get("id")
    title = title_a.get_text(" ", strip=True)
    href = title_a.get("href")
    try:
        outbound_url = urljoin(BASE, href) if href else None
    except ValueError:
        # A malformed href must not abort the whole page; keep the post
        # and record the link as unknown.
        outbound_url = None

    # The metadata lives in the row immediately after the submission row.
    subtext_row = row.find_next_sibling("tr")
    subtext = subtext_row.select_one("td.subtext") if subtext_row else None

    points = None
    author = None
    age = None
    comments = 0
    if subtext:
        score = subtext.select_one("span.score")
        points = parse_int(score.get_text(" ", strip=True) if score else "")

        author_a = subtext.select_one("a.hnuser")
        author = author_a.get_text(" ", strip=True) if author_a else None

        age_a = subtext.select_one("span.age a")
        age = age_a.get_text(" ", strip=True) if age_a else None

        for link in subtext.select("a"):
            # Skip the author and age links: a username that happens to
            # contain "comment" would otherwise be read as the comment count.
            if link is author_a or link is age_a:
                continue
            label = link.get_text(" ", strip=True).lower()
            if "comment" in label or label == "discuss":
                # "discuss" carries no number, so it falls back to 0.
                comments = parse_int(label) or 0
                break

    return {
        "id": item_id,
        "title": title,
        "outbound_url": outbound_url,
        "domain": extract_domain(outbound_url),
        "points": points,
        "author": author,
        "age": age,
        "comments": comments,
        "item_url": f"{BASE}/item?id={item_id}" if item_id else None,
    }
Step 3: parse the page and paginate
def parse_show_page(html: str) -> tuple[list[dict], str | None]:
    """Parse one listing page into launch dicts and the next-page URL.

    Returns:
        ``(launches, next_url)`` where ``launches`` holds every row that
        ``parse_show_row`` could parse and ``next_url`` is the absolute URL
        of the "more" link, or ``None`` when the page has no such link.
    """
    soup = BeautifulSoup(html, "lxml")

    parsed_rows = (parse_show_row(tr) for tr in soup.select("tr.athing.submission"))
    launches = [entry for entry in parsed_rows if entry]

    more = soup.select_one('a.morelink[rel="next"]')
    if not more:
        return launches, None
    return launches, urljoin(BASE + "/", more.get("href"))
def crawl_show_hn(start_url: str = f"{BASE}/show", *, max_pages: int = 3, use_proxiesapi: bool = False) -> list[dict]:
    """Crawl Show HN listing pages and return de-duplicated launch dicts.

    Args:
        start_url: First listing page to fetch.
        max_pages: Upper bound on the number of pages fetched.
        use_proxiesapi: Forwarded to ``fetch``.

    Returns:
        Launch dicts in crawl order; items without an id, or whose id was
        already seen on an earlier page, are dropped.
    """
    rows = []
    seen = set()
    url = start_url
    pages = 0
    while url and pages < max_pages:
        pages += 1
        html = fetch(url, use_proxiesapi=use_proxiesapi)
        batch, url = parse_show_page(html)
        for item in batch:
            key = item["id"]
            if not key or key in seen:
                continue
            seen.add(key)
            rows.append(item)
        # Politeness delay between requests only; after the last page there
        # is no further request to space out.
        if url and pages < max_pages:
            time.sleep(0.8 + random.random())
    return rows
Step 4: turn it into a useful launch monitor
Raw scraping is not the point. The point is getting a feed you can actually review.
import pandas as pd
def build_launch_monitor(*, pages: int = 3, use_proxiesapi: bool = False) -> pd.DataFrame:
    """Crawl Show HN and return a review-ready table of launches.

    Args:
        pages: Maximum number of listing pages to crawl.
        use_proxiesapi: Forwarded to ``crawl_show_hn``.

    Returns:
        A DataFrame sorted by points, then comments (both descending), with
        an added ``score_per_comment`` column (points divided by comments,
        with the comment count floored at 1, rounded to 2 decimals). When
        the crawl yields nothing, an empty frame with the same columns is
        returned so downstream code sees a stable schema.
    """
    columns = [
        "title",
        "domain",
        "points",
        "comments",
        "score_per_comment",
        "author",
        "age",
        "outbound_url",
        "item_url",
    ]
    rows = crawl_show_hn(max_pages=pages, use_proxiesapi=use_proxiesapi)
    df = pd.DataFrame(rows)
    if df.empty:
        return pd.DataFrame(columns=columns)

    # Missing points can surface as None or NaN depending on the column's
    # inferred dtype; coerce both to 0 so the ratio is computed consistently.
    points = pd.to_numeric(df["points"], errors="coerce").fillna(0)
    # Floor at 1 so posts with no comments do not divide by zero.
    comments = pd.to_numeric(df["comments"], errors="coerce").fillna(0).clip(lower=1)
    df["score_per_comment"] = (points / comments).round(2)

    df = df.sort_values(["points", "comments"], ascending=[False, False])
    return df[columns]
if __name__ == "__main__":
    # Crawl two pages directly (no proxy), persist the table, then show a
    # ten-row preview followed by the total row count.
    df = build_launch_monitor(pages=2, use_proxiesapi=False)
    df.to_csv("show_hn_launch_monitor.csv", index=False)
    preview = df.head(10).to_string(index=False)
    print(preview)
    print(f"rows: {len(df)}")
Example output:
title domain points comments score_per_comment author age outbound_url item_url
Show HN: Uruky (EU-based Kagi alternative) now has Image Search... uruky.com 141 150 0.94 BrunoBernardino 7 hours ago https://uruky.com/?il=en https://news.ycombinator.com/item?id=48396004
Show HN: Paseo – Beautiful open-source coding agent interface github.com 85 51 1.67 timhigins 1 day ago https://github.com/getpaseo/paseo https://news.ycombinator.com/item?id=48377250
What to do with this data
Once you have a CSV or dataframe, you can build surprisingly useful downstream workflows:
- alert on new posts above a points threshold
- group launches by domain or category
- enrich outbound URLs with page titles and descriptions
- track which launches later show up on Product Hunt, GitHub Trending, or newsletters
The important design choice is this: scrape Show HN first, then fan out into linked product pages only when the post is interesting enough.
That keeps your monitor cheap and fast.
Where ProxiesAPI fits
You do not need ProxiesAPI just to scrape one HN page.
You may want it when your launch-monitor pipeline expands into:
- linked product sites with bot protection
- region-sensitive landing pages
- repeat fetches across many sources
- one shared network layer for HN, GitHub, directories, and docs
That is where ProxiesAPI starts earning its keep. Not on the cleanest source, but on the messy ones downstream.
Practical caveats
- HN exposes an official API, but the HTML is often easier when you care about the exact list view.
- A "discuss" link in place of a comment count means the post has zero comments; handle that explicitly.
- Respect politeness delays even on friendly sites.
If your goal is discovering launches quickly, this HTML parser is enough. It gives you titles, domains, engagement signals, and a direct path into the discussion thread.
Hacker News itself is simple. But launch monitoring often spreads into linked product sites, directories, and docs pages. ProxiesAPI helps when that wider fetch layer gets noisy.