Scrape GitHub Repository Data
GitHub has a great API — but sometimes you still need to scrape the actual HTML:
- a site you’re integrating with only links to repo pages
- you want to parse a repo list from an org page
- you need fields that differ across API versions / auth scopes
- you want a lightweight “no-token” scraper for quick analysis
In this guide we’ll build a Python scraper that extracts:
- stars and forks
- repository topics
- short repo description / about text
- README-linked context you can feed into downstream analysis
…and exports results to CSV.

GitHub pages are fast, but repeated scraping can hit rate limits or blocking. ProxiesAPI helps you keep request volume predictable with a stable proxy + retry layer.
Setup
python -m venv .venv
source .venv/bin/activate
pip install requests beautifulsoup4 lxml
Step 1: Fetch with timeouts + retries (and ProxiesAPI)
Same pattern as other scrapers: build a reliable network layer first.
import os
import time
import random
from urllib.parse import urlencode
import requests
# Passed as `timeout=` on every request made by fetch(). Requests documents a
# 2-tuple timeout as (connect timeout, read timeout), in seconds.
TIMEOUT = (10, 30)
# Maximum number of attempts fetch() makes per URL, counting the first try.
MAX_RETRIES = 5
# Single Session object shared by every call to fetch().
session = requests.Session()
# Browser-like headers sent with every request made by fetch().
DEFAULT_HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/122.0.0.0 Safari/537.36"
),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9",
}
def build_proxiesapi_url(target_url: str) -> str:
    """Return the ProxiesAPI request URL that fetches *target_url*.

    The API key is read from the ``PROXIESAPI_KEY`` environment variable and
    the endpoint from ``PROXIESAPI_URL`` (default
    ``https://api.proxiesapi.com``). The key and the target URL are
    URL-encoded into the query string as ``api_key`` and ``url``.

    Args:
        target_url: The page the proxy should fetch.

    Returns:
        A URL of the form ``{endpoint}/?api_key=...&url=...``.

    Raises:
        RuntimeError: If ``PROXIESAPI_KEY`` is unset or empty.
    """
    api_key = os.environ.get("PROXIESAPI_KEY")
    if not api_key:
        raise RuntimeError("Missing PROXIESAPI_KEY env var")

    # An empty PROXIESAPI_URL is treated like an unset one, and trailing
    # slashes are dropped so "https://host/" does not become "https://host//?".
    endpoint = os.environ.get("PROXIESAPI_URL") or "https://api.proxiesapi.com"
    endpoint = endpoint.rstrip("/")

    query = urlencode({"api_key": api_key, "url": target_url})
    return f"{endpoint}/?{query}"
def fetch(url: str, *, use_proxiesapi: bool = True) -> str:
    """Download *url* and return the response body as text.

    Makes up to ``MAX_RETRIES`` attempts. Between attempts it sleeps with
    exponential backoff (1s, 2s, 4s, ... capped at 20s) plus up to 0.5s of
    random jitter.

    Args:
        url: The page to download.
        use_proxiesapi: When true (the default) the request is routed through
            the URL produced by ``build_proxiesapi_url``; otherwise *url* is
            requested directly.

    Returns:
        The decoded response body.

    Raises:
        RuntimeError: If ``PROXIESAPI_KEY`` is missing (raised immediately,
            never retried), if the last attempt still returned 429/503, or
            if ``MAX_RETRIES`` is less than 1.
        requests.RequestException: If the last attempt failed with a network
            error or a non-success HTTP status.
    """
    # Resolve the request URL once, outside the retry loop: a missing API key
    # is a configuration error that no amount of backoff will fix.
    final_url = build_proxiesapi_url(url) if use_proxiesapi else url

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = session.get(final_url, headers=DEFAULT_HEADERS, timeout=TIMEOUT)
            # GitHub may return 429/503 under load.
            if r.status_code in (429, 503):
                raise RuntimeError(f"transient status {r.status_code}")
            r.raise_for_status()
            return r.text
        except (requests.RequestException, RuntimeError) as e:
            # Only request failures and the transient-status signal above are
            # retried; anything else is a bug and should surface right away.
            if attempt == MAX_RETRIES:
                raise
            sleep_s = min(20, 2 ** (attempt - 1)) + random.uniform(0, 0.5)
            print(f"fetch failed ({attempt}/{MAX_RETRIES}): {e} — sleeping {sleep_s:.1f}s")
            time.sleep(sleep_s)

    # Only reachable when MAX_RETRIES < 1; fail loudly rather than return None.
    raise RuntimeError("fetch made no attempts: MAX_RETRIES must be >= 1")
Step 2: Understand what to scrape on a repo page
A GitHub repository page lives at:
https://github.com/{owner}/{repo}
You’ll typically find:
- Stars and Forks as “counter” elements near the top
- “Issues” and “Pull requests” tabs with count badges
- “Releases” area (sometimes) with a latest tag
The exact DOM shifts over time, so we’ll use a few conservative selector strategies.
Step 3: Parse repo metadata (defensive selectors)
import re
from bs4 import BeautifulSoup
def clean_int(text: str) -> int | None:
if not text:
return None
t = text.strip().lower().replace(",", "")
# GitHub sometimes shows 1.2k, 3.4m
m = re.match(r"^(\d+(?:\.\d+)?)([km])?$", t)
if m:
num = float(m.group(1))
suf = m.group(2)
if suf == "k":
return int(num * 1000)
if suf == "m":
return int(num * 1_000_000)
return int(num)
m2 = re.search(r"(\d+)", t)
return int(m2.group(1)) if m2 else None
def pick_counter(soup: BeautifulSoup, href_contains: str) -> int | None:
    """Return the count shown on the first link whose href contains a substring.

    Two selectors are tried in order: a ``.Counter`` element nested inside a
    matching link, then the matching link itself. The text of the first
    element found is handed to ``clean_int``.

    Args:
        soup: Parsed page to search.
        href_contains: Substring the link's ``href`` must contain, for
            example ``"/stargazers"``.

    Returns:
        Whatever ``clean_int`` returns for the element's text, or ``None``
        when no link matches.
    """
    link_selector = f'a[href*="{href_contains}"]'
    # Preferred shape first: <a href="/.../stargazers"> <span class="Counter">
    for selector in (f"{link_selector} .Counter", link_selector):
        node = soup.select_one(selector)
        if node:
            return clean_int(node.get_text(" ", strip=True))
    return None
def parse_repo_page(html: str, page_url: str | None = None) -> dict:
    """Extract best-effort metadata from the HTML of a repository page.

    Args:
        html: HTML of a ``https://github.com/{owner}/{repo}`` page.
        page_url: URL the HTML was fetched from, used to resolve a relative
            README link. When omitted, the page's canonical link or
            ``og:url`` metadata is used if present, else
            ``https://github.com``.

    Returns:
        A dict with the keys ``repo`` (str or None), ``stars`` and ``forks``
        (int or None), ``topics`` (list of unique strings in page order),
        ``about`` (str or None) and ``readme_url`` (absolute URL or None).
    """
    # Imported locally so the function does not rely on a module-level import
    # that appears later in the file.
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, "lxml")

    # Repo name (best-effort)
    name_el = soup.select_one("strong.mr-2.flex-self-stretch a") or soup.select_one("h1")
    repo_name = name_el.get_text(" ", strip=True) if name_el else None

    stars = pick_counter(soup, "/stargazers")
    forks = pick_counter(soup, "/forks")

    # dict.fromkeys de-duplicates while preserving first-seen order.
    topic_texts = (a.get_text(" ", strip=True) for a in soup.select('a[href*="/topics/"]'))
    topics = list(dict.fromkeys(t for t in topic_texts if t))

    about_el = soup.select_one('p.f4.my-3') or soup.select_one('p.f4.mt-3')
    about = about_el.get_text(" ", strip=True) if about_el else None

    readme_link = None
    readme_a = soup.select_one('a[href$="#readme"]') or soup.select_one('a[href*="#readme"]')
    href = readme_a.get("href") if readme_a else None
    if href:
        base = page_url
        if not base:
            # NOTE(review): assumes repo pages expose their own URL through
            # <link rel="canonical"> or <meta property="og:url"> - confirm
            # against a live page.
            canonical = soup.select_one('link[rel="canonical"]')
            og_url = soup.select_one('meta[property="og:url"]')
            base = (
                (canonical.get("href") if canonical else None)
                or (og_url.get("content") if og_url else None)
                or "https://github.com"
            )
        # urljoin handles absolute, root-relative and fragment-only hrefs.
        # Prefixing the domain turned "#readme" into
        # "https://github.com#readme", which is the GitHub homepage.
        readme_link = urljoin(base, href)

    return {
        "repo": repo_name,
        "stars": stars,
        "forks": forks,
        "topics": topics,
        "about": about,
        "readme_url": readme_link,
    }
# Demo run: download one repository page and print the parsed metadata.
# fetch() routes through ProxiesAPI by default, so PROXIESAPI_KEY must be set.
url = "https://github.com/psf/requests"
html = fetch(url)
meta = parse_repo_page(html)
print(meta)
Step 4: Pull README-linked context
For enrichment workflows, the README is often the most useful human-written context on the page. We do not need the entire markdown blob in every export. A small structured summary is enough: section headings and outbound docs links.
from urllib.parse import urljoin
def parse_readme_context(html: str, base: str = "https://github.com") -> dict:
    """Summarise a rendered README as its section headings and outbound links.

    Args:
        html: HTML containing the rendered README, looked up as
            ``article.markdown-body`` or ``#readme article``.
        base: URL that relative link targets are resolved against.

    Returns:
        ``{"headings": [...], "links": [...]}``. ``headings`` holds the text
        of up to 20 non-empty h1/h2/h3 elements. ``links`` holds up to 20
        ``{"label", "url"}`` dicts, one per distinct absolute URL, in page
        order. Both lists are empty when no README article is found.
    """
    soup = BeautifulSoup(html, "lxml")
    article = soup.select_one('article.markdown-body') or soup.select_one('#readme article')
    if not article:
        return {"headings": [], "links": []}

    headings = [
        text
        for el in article.select("h1, h2, h3")
        if (text := el.get_text(" ", strip=True))
    ]

    links = []
    seen = set()
    for a in article.select("a[href]"):
        raw = (a.get("href") or "").strip()
        # Fragment-only hrefs are in-page anchors (table-of-contents entries),
        # not outbound links. Joined against the bare domain they would become
        # bogus URLs such as "https://github.com#usage".
        if not raw or raw.startswith("#"):
            continue
        href = urljoin(base, raw)
        label = a.get_text(" ", strip=True)
        if label and href not in seen:
            seen.add(href)
            links.append({"label": label, "url": href})

    return {"headings": headings[:20], "links": links[:20]}
# Download the README target when the repo page exposed one; otherwise reuse
# the repo page HTML that is already in memory.
readme_html = fetch(meta["readme_url"]) if meta.get("readme_url") else html
readme_context = parse_readme_context(readme_html)
print(readme_context)
Step 5: Save to CSV (repo + README context)
import csv
def write_repo_csv(path: str, row: dict) -> None:
    """Write one repository record to *path* as a header line plus one row.

    The file is created or overwritten, encoded as UTF-8. Columns are
    ``repo``, ``stars``, ``forks``, ``topics``, ``about`` and ``readme_url``;
    keys missing from *row* become empty cells and the ``topics`` list is
    flattened into one comma-separated cell.

    Args:
        path: Destination file path.
        row: Mapping holding the repository fields.
    """
    columns = ["repo", "stars", "forks", "topics", "about", "readme_url"]
    with open(path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        record = {column: row.get(column) for column in columns}
        # A missing or None topics value is written as an empty cell.
        record["topics"] = ",".join(row.get("topics") or [])
        writer.writerow(record)
def write_readme_links_csv(path: str, rows: list[dict]) -> None:
    """Write README links to *path* as a two-column CSV (``label``, ``url``).

    The file is created or overwritten, encoded as UTF-8, and always starts
    with a header line. Keys other than ``label`` and ``url`` are ignored;
    missing ones become empty cells.

    Args:
        path: Destination file path.
        rows: Link records, each a mapping with ``label`` and ``url``.
    """
    columns = ["label", "url"]
    with open(path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        writer.writerows(
            {column: link.get(column) for column in columns} for link in rows
        )
# Export both results to the current working directory: one row of repo
# metadata, plus one row per README link.
write_repo_csv("github_repo.csv", meta)
write_readme_links_csv("github_readme_links.csv", readme_context["links"])
print("wrote github_repo.csv and github_readme_links.csv")
Rate limits: what to expect
GitHub is generally scrape-friendly for low volume, but at scale you can hit:
- 429 / 503 responses
- pages that load slower
- blocks if you behave like a bot (no UA, no backoff, high concurrency)
Keep it boring:
- low concurrency (start with 1)
- backoff on 429/503
- cache results and only refresh periodically
Where ProxiesAPI fits (honestly)
If you’re scraping a handful of repos, you may not need proxies.
ProxiesAPI becomes useful when:
- you crawl lots of repos nightly
- you scrape org repo lists + every repo detail page
- you run scrapes from a fixed server IP that gets throttled
QA checklist
- parse_repo_page() returns sane integers for stars/forks
- issues parsing returns titles + urls
- CSV output opens correctly in Excel/Sheets
Next upgrades
- parse README text for keywords
- detect default branch and scrape /commits activity
- store snapshots in SQLite + diff over time
GitHub pages are fast, but repeated scraping can hit rate limits or blocking. ProxiesAPI helps you keep request volume predictable with a stable proxy + retry layer.