Scrape GitHub Repository Data (Stars, Releases, Issues) with Python + ProxiesAPI
GitHub has a great API — but sometimes you still need to scrape the actual HTML:
- a site you’re integrating with only links to repo pages
- you want to parse a repo list from an org page
- you need fields that differ across API versions / auth scopes
- you want a lightweight “no-token” scraper for quick analysis
In this guide we’ll build a Python scraper that extracts:
- stars and forks
- open issues and pull requests counts
- latest release tag (if any)
- a small list of recent issues (title + url + labels)
…and exports results to CSV.

GitHub pages are fast, but repeated scraping can hit rate limits or blocking. ProxiesAPI helps you keep request volume predictable with a stable proxy + retry layer.
Setup
python -m venv .venv
source .venv/bin/activate
pip install requests beautifulsoup4 lxml
Step 1: Fetch with timeouts + retries (and ProxiesAPI)
Same pattern as other scrapers: build a reliable network layer first.
import os
import time
import random
from urllib.parse import urlencode
import requests
# Network tuning: (connect_timeout, read_timeout) in seconds, and the retry budget.
TIMEOUT = (10, 30)
MAX_RETRIES = 5

# One shared Session so TCP connections are pooled across requests.
session = requests.Session()

# Browser-like headers; a missing User-Agent is a common cause of bot blocks.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}
def build_proxiesapi_url(target_url: str) -> str:
    """Wrap *target_url* in a ProxiesAPI request URL.

    Reads PROXIESAPI_KEY (required) and PROXIESAPI_URL (optional endpoint
    override) from the environment. Raises RuntimeError when the key is
    missing or empty.
    """
    key = os.environ.get("PROXIESAPI_KEY")
    if key is None or key == "":
        raise RuntimeError("Missing PROXIESAPI_KEY env var")
    endpoint = os.environ.get("PROXIESAPI_URL", "https://api.proxiesapi.com")
    # urlencode handles percent-escaping of the target URL for us.
    query = urlencode({"api_key": key, "url": target_url})
    return endpoint + "/?" + query
def fetch(url: str, *, use_proxiesapi: bool = True) -> str:
    """Download *url* and return the response body text.

    Retries up to MAX_RETRIES times with capped exponential backoff plus
    jitter. 429/503 responses are treated as transient and retried; other
    HTTP errors raise via raise_for_status(). When use_proxiesapi is true,
    the request is routed through build_proxiesapi_url().
    """
    attempt = 0
    while True:
        attempt += 1
        try:
            target = build_proxiesapi_url(url) if use_proxiesapi else url
            resp = session.get(target, headers=DEFAULT_HEADERS, timeout=TIMEOUT)
            # GitHub may shed load with 429/503 — treat those as retryable.
            if resp.status_code in (429, 503):
                raise RuntimeError(f"transient status {resp.status_code}")
            resp.raise_for_status()
            return resp.text
        except Exception as exc:
            if attempt >= MAX_RETRIES:
                raise
            # 1s, 2s, 4s, ... capped at 20s, plus jitter to avoid lockstep.
            delay = min(20, 2 ** (attempt - 1)) + random.uniform(0, 0.5)
            print(f"fetch failed ({attempt}/{MAX_RETRIES}): {exc} — sleeping {delay:.1f}s")
            time.sleep(delay)
Step 2: Understand what to scrape on a repo page
A GitHub repository page lives at:
https://github.com/{owner}/{repo}
You’ll typically find:
- Stars and Forks as “counter” elements near the top
- “Issues” and “Pull requests” tabs with count badges
- “Releases” area (sometimes) with a latest tag
The exact DOM shifts over time, so we’ll use a few conservative selector strategies.
Step 3: Parse repo metadata (defensive selectors)
import re
from bs4 import BeautifulSoup
def clean_int(text: str) -> int | None:
if not text:
return None
t = text.strip().lower().replace(",", "")
# GitHub sometimes shows 1.2k, 3.4m
m = re.match(r"^(\d+(?:\.\d+)?)([km])?$", t)
if m:
num = float(m.group(1))
suf = m.group(2)
if suf == "k":
return int(num * 1000)
if suf == "m":
return int(num * 1_000_000)
return int(num)
m2 = re.search(r"(\d+)", t)
return int(m2.group(1)) if m2 else None
def pick_counter(soup: BeautifulSoup, href_contains: str) -> int | None:
    """Return the numeric badge for the first link whose href contains *href_contains*.

    Prefers the dedicated <span class="Counter"> badge; falls back to the
    link's own text. Returns None when no matching link yields a number.
    """
    # Preferred: <a href="/.../stargazers"><span class="Counter">…</span></a>
    badge = soup.select_one(f'a[href*="{href_contains}"] .Counter')
    if badge:
        return clean_int(badge.get_text(" ", strip=True))
    # Fallback: the full text of any anchor with a matching href.
    link = soup.select_one(f'a[href*="{href_contains}"]')
    if link:
        return clean_int(link.get_text(" ", strip=True))
    return None
def parse_repo_page(html: str) -> dict:
    """Extract star/fork/issue/PR counts and the latest release tag from a repo page.

    All fields are best-effort and may be None when the DOM doesn't match.
    """
    soup = BeautifulSoup(html, "lxml")

    # Repo name: prefer the breadcrumb link, fall back to the page <h1>.
    heading = soup.select_one("strong.mr-2.flex-self-stretch a") or soup.select_one("h1")
    name = heading.get_text(" ", strip=True) if heading else None

    # Counter badges all sit behind links with predictable href suffixes.
    counts = {
        "stars": pick_counter(soup, "/stargazers"),
        "forks": pick_counter(soup, "/forks"),
        "open_issues": pick_counter(soup, "/issues"),
        "open_pulls": pick_counter(soup, "/pulls"),
    }

    # Latest release tag, when the page links to /releases/tag/<tag>.
    tag_link = soup.select_one('a[href*="/releases/tag/"]')
    release = tag_link.get_text(" ", strip=True) if tag_link else None

    return {"repo": name, **counts, "latest_release": release}
# Demo: fetch one well-known repo and print its parsed metadata.
url = "https://github.com/psf/requests"
html = fetch(url)  # network call (goes through ProxiesAPI by default)
meta = parse_repo_page(html)
print(meta)
Step 4: Scrape a small set of recent issues
Issues live at:
https://github.com/{owner}/{repo}/issues
We’ll parse the first page of issues (title, url, labels). This is enough for many dashboards.
from urllib.parse import urljoin
def parse_issues_list(html: str, base: str = "https://github.com") -> list[dict]:
    """Parse the first page of a repo's issue list.

    Returns a list of {"title", "url", "labels"} dicts, deduplicated by URL
    in first-seen order. Only anchors that link to a concrete issue number
    (".../issues/<n>") are kept, so the "Issues" tab link and label/milestone
    filter links are excluded.
    """
    soup = BeautifulSoup(html, "lxml")
    out = []
    for a in soup.select('a[href*="/issues/"]'):
        href = a.get("href") or ""
        # Skip PR links and links that don't point at a specific issue
        # number (e.g. the "Issues" tab itself, which is just ".../issues").
        if "/pull/" in href or not re.search(r"/issues/\d+", href):
            continue
        title = a.get_text(" ", strip=True)
        if not title or len(title) < 3:
            continue
        # Walk up a few ancestors looking for the issue-row container so the
        # labels we collect belong to THIS issue only.
        row = a
        found_row = False
        for _ in range(5):
            if not row:
                break
            if row.name in ("div", "li") and "js-issue-row" in (row.get("class") or []):
                found_row = True
                break
            row = row.parent
        # If no js-issue-row ancestor was found, fall back to the direct
        # parent rather than an arbitrary (possibly page-wide) ancestor,
        # which would sweep in labels from unrelated issues.
        container = row if found_row else a.parent
        labels = []
        if container:
            for lab in container.select('a[href*="/labels/"]'):
                t = lab.get_text(" ", strip=True)
                if t:
                    labels.append(t)
        out.append({
            "title": title,
            "url": urljoin(base, href),
            "labels": labels,
        })
    # Deduplicate by URL, preserving first-seen order.
    uniq = []
    seen = set()
    for it in out:
        if it["url"] in seen:
            continue
        seen.add(it["url"])
        uniq.append(it)
    return uniq
# Demo: fetch the first page of the repo's issue list and parse it.
issues_html = fetch("https://github.com/psf/requests/issues")
issues = parse_issues_list(issues_html)
print("issues:", len(issues))
print(issues[:3])  # peek at the first few parsed rows
Step 5: Save to CSV (repo + issues)
import csv
def write_repo_csv(path: str, row: dict) -> None:
    """Write a single repo-metadata dict to *path* as a one-row CSV with header.

    Missing keys are written as empty cells (dict.get returns None).
    """
    fields = ["repo", "stars", "forks", "open_issues", "open_pulls", "latest_release"]
    record = {name: row.get(name) for name in fields}
    with open(path, "w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=fields)
        writer.writeheader()
        writer.writerow(record)
def write_issues_csv(path: str, rows: list[dict]) -> None:
    """Write issue dicts to *path* as CSV; label lists are comma-joined.

    Each row is {"title", "url", "labels"}; a missing or None labels list
    becomes an empty cell.
    """
    fields = ["title", "url", "labels"]
    with open(path, "w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=fields)
        writer.writeheader()
        for item in rows:
            writer.writerow({
                "title": item.get("title"),
                "url": item.get("url"),
                "labels": ",".join(item.get("labels") or []),
            })
# Export: one-row repo summary plus up to 50 recent issues.
write_repo_csv("github_repo.csv", meta)
write_issues_csv("github_issues.csv", issues[:50])
print("wrote github_repo.csv and github_issues.csv")
Rate limits: what to expect
GitHub is generally scrape-friendly for low volume, but at scale you can hit:
- 429 / 503 responses
- pages that load slower
- blocks if you behave like a bot (no UA, no backoff, high concurrency)
Keep it boring:
- low concurrency (start with 1)
- backoff on 429/503
- cache results and only refresh periodically
Where ProxiesAPI fits (honestly)
If you’re scraping a handful of repos, you may not need proxies.
ProxiesAPI becomes useful when:
- you crawl lots of repos nightly
- you scrape org repo lists + every repo detail page
- you run scrapes from a fixed server IP that gets throttled
QA checklist
- parse_repo_page() returns sane integers for stars/forks
- issues parsing returns titles + urls
- CSV output opens correctly in Excel/Sheets
Next upgrades
- parse README text for keywords
- detect default branch and scrape /commits activity
- store snapshots in SQLite + diff over time
GitHub pages are fast, but repeated scraping can hit rate limits or blocking. ProxiesAPI helps you keep request volume predictable with a stable proxy + retry layer.