Scrape GitHub Repository Data (Stars, Releases, Issues) with Python + ProxiesAPI
GitHub has an excellent public API — but lots of useful data is easy to extract from HTML too:
- stars / forks
- “Used by” counts
- latest release tag
- open issues / PR counts
Scraping HTML is sometimes the right move when:
- you want a single pipeline for many sites (not a special API client per site)
- you need the exact same numbers users see on the page
- you’re building a lightweight dataset quickly
In this guide we’ll build a real Python scraper that:
- fetches a repo page (and optional releases/issues pages) via ProxiesAPI
- parses key fields with resilient selectors
- exports the results to CSV

GitHub is usually scrapable, but high request volume can trigger rate limits. ProxiesAPI helps you spread requests and keep your fetch layer consistent while you focus on parsing and data quality.
Before you scrape GitHub (practical constraints)
GitHub is on the Green List for ProxiesAPI in this project — meaning it has been tested to return real HTML through the ProxiesAPI fetch layer.
Still, expect:
- rate limits (especially on unauthenticated traffic)
- occasionally different markup for logged-in vs logged-out states
- A/B tests
So our scraper will:
- use timeouts
- retry with exponential backoff
- treat missing elements as normal (return `None` rather than crashing)
Setup
python -m venv .venv
source .venv/bin/activate
pip install requests beautifulsoup4 lxml
ProxiesAPI fetch helper (with backoff)
import os
import random
import time
import urllib.parse
import requests
# API key comes from the environment; "YOUR_KEY" is only a placeholder.
PROXIESAPI_KEY = os.getenv("PROXIESAPI_KEY", "YOUR_KEY")
PROXIESAPI_ENDPOINT = "http://api.proxiesapi.com/"
# (connect timeout, read timeout) in seconds -- passed straight to requests.
TIMEOUT = (10, 40)
# One shared Session: connection reuse plus browser-like default headers,
# so every request from this script presents consistently.
session = requests.Session()
session.headers.update(
    {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/123.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }
)
def proxiesapi_url(target_url: str) -> str:
    """Build the ProxiesAPI wrapper URL for *target_url*.

    Uses urlencode so BOTH parameters are percent-encoded (the original
    interpolated auth_key raw, which breaks if the key ever contains a
    reserved character).  quote_via=quote keeps the target URL encoded
    exactly as before (spaces -> %20, '/' escaped).
    """
    query = urllib.parse.urlencode(
        {"auth_key": PROXIESAPI_KEY, "url": target_url},
        quote_via=urllib.parse.quote,
    )
    return f"{PROXIESAPI_ENDPOINT}?{query}"
def fetch_html(target_url: str, *, max_retries: int = 5) -> str:
    """Fetch *target_url* through ProxiesAPI with retries + exponential backoff.

    A response with HTTP status >= 400, or a suspiciously small body (proxy
    and CDN error pages are tiny), counts as a failed attempt.

    Raises:
        RuntimeError: after max_retries failed attempts, chaining the last error.
    """
    last_error: Exception | None = None
    for attempt in range(1, max_retries + 1):
        try:
            r = session.get(proxiesapi_url(target_url), timeout=TIMEOUT)
            if r.status_code >= 400:
                raise requests.HTTPError(f"HTTP {r.status_code}")
            text = r.text
            if len(text) < 2000:
                raise ValueError(f"Suspiciously small HTML ({len(text)} bytes)")
            return text
        except Exception as e:
            last_error = e
            # Only back off if another attempt remains; the original slept
            # (up to ~10.5s) even after the final failure, right before raising.
            if attempt < max_retries:
                time.sleep(min(10, 1.6 ** attempt) + random.uniform(0, 0.5))
    raise RuntimeError(f"Failed to fetch {target_url}: {last_error}")
What we’ll extract
Given a repo URL like:
https://github.com/psf/requests
We’ll extract:
owner, repo, stars, forks, issues_open, pulls_open, latest_release_tag (best effort)
We’ll do it with HTML parsing and “try multiple selectors” logic, because GitHub markup evolves.
Parsing helpers
import re
from bs4 import BeautifulSoup
def clean(s: str) -> str:
    """Collapse every run of whitespace to a single space and trim the ends."""
    # str.split() with no arguments splits on any whitespace run, so joining
    # with a single space normalizes and strips in one pass.
    return " ".join((s or "").split())
def first_text(soup, selectors: list[str]) -> str | None:
    """Return the cleaned text of the first selector that matches a node
    with non-empty text; None when nothing usable matches."""
    for selector in selectors:
        match = soup.select_one(selector)
        if match is None:
            continue
        text = clean(match.get_text(" ", strip=True))
        if text:
            return text
    return None
def parse_count(text: str | None) -> int | None:
if not text:
return None
t = text.lower().replace(",", "").strip()
# GitHub shows 1.2k / 3.4k etc
m = re.match(r"([0-9]*\.?[0-9]+)\s*([km])?", t)
if not m:
# try to find any integer
m2 = re.search(r"(\d+)", t)
return int(m2.group(1)) if m2 else None
num = float(m.group(1))
suf = m.group(2)
if suf == "k":
return int(num * 1000)
if suf == "m":
return int(num * 1_000_000)
return int(num)
Parse a GitHub repo page (HTML)
We’ll focus on selectors that are commonly present:
- Stars/forks often appear as links whose `href` ends in `/stargazers` or `/forks`.
- Issues/PR counts appear in the repo navigation.
from urllib.parse import urlparse
def parse_repo_page(html: str, repo_url: str) -> dict:
    """Extract owner/repo, star/fork/issue/PR counts, and (best effort) the
    latest release tag from a GitHub repository page.

    Missing elements simply yield None values -- GitHub markup evolves, so
    each field is tried against several selectors in priority order.
    """
    soup = BeautifulSoup(html, "lxml")

    # Owner/repo come from the URL path: more stable than parsing the HTML.
    segments = urlparse(repo_url).path.strip("/").split("/")
    padded = segments + [None, None]
    owner, repo = padded[0], padded[1]

    # Selector lists per count field, ordered most-specific-last.
    count_selectors = {
        "stars": [
            'a[href$="/stargazers"]',
            'a[href$="/stargazers"] span',
            'a[href$="/stargazers"] strong',
        ],
        "forks": [
            'a[href$="/forks"]',
            'a[href$="/forks"] span',
            'a[href$="/forks"] strong',
        ],
        "issues_open": [
            'a[href$="/issues"] span.Counter',
            'a[id*="issues-tab"] span.Counter',
        ],
        "pulls_open": [
            'a[href$="/pulls"] span.Counter',
            'a[id*="pull-requests-tab"] span.Counter',
        ],
    }
    counts = {
        field: parse_count(first_text(soup, selectors))
        for field, selectors in count_selectors.items()
    }

    # Latest release: repo-page widget first, then the /releases link text.
    latest_release = first_text(
        soup,
        [
            'a[href*="/releases/tag/"]',
            'a[href$="/releases"] span',
        ],
    )

    return {
        "repo_url": repo_url,
        "owner": owner,
        "repo": repo,
        "stars": counts["stars"],
        "forks": counts["forks"],
        "issues_open": counts["issues_open"],
        "pulls_open": counts["pulls_open"],
        "latest_release_tag": latest_release,
    }
If latest_release_tag is None, we’ll fetch /releases next.
Fetch releases page to extract the latest tag
def parse_latest_release_from_releases_page(html: str) -> str | None:
    """Return the newest release tag from a /releases page, or None."""
    soup = BeautifulSoup(html, "lxml")
    # The first /releases/tag/ link on the page is normally the newest release.
    text = first_text(soup, ['a[href*="/releases/tag/"]'])
    # Link text can carry extra labels; keep only the first token.
    return text.split()[0] if text else None
def enrich_latest_release(row: dict) -> dict:
    """Backfill row["latest_release_tag"] by fetching /releases when the
    repo page didn't expose it.  Mutates and returns *row*."""
    if row.get("latest_release_tag"):
        return row
    owner, repo = row.get("owner"), row.get("repo")
    # Without both path segments we can't build the releases URL.
    if owner and repo:
        html = fetch_html(f"https://github.com/{owner}/{repo}/releases")
        row["latest_release_tag"] = parse_latest_release_from_releases_page(html)
    return row
Put it together: scrape a list of repos → CSV
import csv
def scrape_repo(repo_url: str) -> dict:
    """Fetch one repo page, parse it, and backfill the release tag."""
    page_html = fetch_html(repo_url)
    return enrich_latest_release(parse_repo_page(page_html, repo_url))
def write_csv(rows: list[dict], path: str = "github_repos.csv") -> None:
    """Write scraped repo rows to *path* as CSV with a fixed column order.

    Keys missing from a row are emitted as empty cells.
    """
    columns = [
        "repo_url",
        "owner",
        "repo",
        "stars",
        "forks",
        "issues_open",
        "pulls_open",
        "latest_release_tag",
    ]
    with open(path, "w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        # Project each row onto the fixed column set so extra keys never break
        # DictWriter, then stream them all in one call.
        writer.writerows({name: row.get(name) for name in columns} for row in rows)
if __name__ == "__main__":
    # Demo run: scrape a few well-known repos and dump the rows to CSV.
    # Note: this performs live network requests through ProxiesAPI.
    repos = [
        "https://github.com/psf/requests",
        "https://github.com/pallets/flask",
        "https://github.com/tiangolo/fastapi",
    ]
    rows = []
    for url in repos:
        print("scraping", url)
        rows.append(scrape_repo(url))
    print(rows)
    write_csv(rows)
    print("wrote github_repos.csv")
Handling GitHub rate limits (without drama)
If you scale this up (hundreds/thousands of repos), you’ll hit rate limits.
Practical mitigation:
- add sleep + jitter between requests
- use a cache (don’t re-fetch unchanged repos daily)
- separate “repo list discovery” from “repo details crawl”
- treat `None` values as expected — don't crash on missing fields
If you need exact, stable numbers at scale, GitHub’s API is still the right tool — but HTML scraping can be a fast, flexible alternative for many workflows.
Where ProxiesAPI fits (honestly)
GitHub is often scrapable without proxies.
But once you crawl many pages (repo list → repo page → releases → issues), failure rate rises.
ProxiesAPI helps you keep the fetch side consistent. Your code still needs good hygiene (timeouts, retries, backoff), and your parser still needs maintenance — but your scraping pipeline becomes far less fragile.
GitHub is usually scrapable, but high request volume can trigger rate limits. ProxiesAPI helps you spread requests and keep your fetch layer consistent while you focus on parsing and data quality.