#!/usr/bin/env python3 """ fetch_portraits.py Download portrait images for every person in people.json -→ ./images/people/… How it works ------------ 1. Wikipedia API : get the Wikidata Q-id for the page about <name> 2. Wikidata API : fetch property P18 (image filename) for that Q-id 3. Commons file : resolve File:… to a real image URL via Special:FilePath 4. Save locally : mkdir -p images/people && write binary data If anything fails we log and continue. """ import json import os import pathlib import sys import time from urllib.parse import quote_plus import requests from tqdm import tqdm WIKI_API = "https://en.wikipedia.org/w/api.php" WIKIDATA_API = "https://www.wikidata.org/wiki/Special:EntityData/{}.json" COMMONS_FILE = "https://commons.wikimedia.org/wiki/Special:FilePath/{}" HEADERS = {"User-Agent": "ShrineTimelineBot/0.1 (https://github.com/shrine)"} # --------------------------------------------------------------------------- # # Helper functions # --------------------------------------------------------------------------- # def wikipedia_to_qid(title: str) -> str | None: """Return the Wikidata Q-identifier for a Wikipedia page title.""" params = { "action": "query", "format": "json", "titles": title, "prop": "pageprops", "redirects": 1, } r = requests.get(WIKI_API, params=params, headers=HEADERS, timeout=15) r.raise_for_status() pages = r.json()["query"]["pages"].values() for page in pages: return page.get("pageprops", {}).get("wikibase_item") # e.g. "Q937" return None def qid_to_image_filename(qid: str) -> str | None: """Return the Commons file name (e.g. 'Albert Einstein Head.jpg').""" r = requests.get(WIKIDATA_API.format(qid), headers=HEADERS, timeout=15) r.raise_for_status() entity = r.json()["entities"][qid] claims = entity["claims"] if "P18" not in claims: return None # Use the first image in P18 return claims["P18"][0]["mainsnak"]["datavalue"]["value"] from urllib.parse import quote # swap quote_plus → quote def download_commons_file(filename: str, dest_path: pathlib.Path) -> bool: """ Resolve image via Special:FilePath and stream to dest_path. Returns True if saved, False if already on disk. """ if dest_path.exists(): return False # 1. MediaWiki expects underscores, not spaces fname = filename.replace(" ", "_") # 2. URL-encode *except* underscores, parentheses, apostrophes, dots, dashes # (keeps filenames readable and avoids double-encoding) safe = "_().'-" url = f"https://commons.wikimedia.org/wiki/Special:FilePath/{quote(fname, safe=safe)}" with requests.get(url, headers=HEADERS, stream=True, timeout=30) as r: r.raise_for_status() # will now 302 → 200 instead of 404 dest_path.parent.mkdir(parents=True, exist_ok=True) with open(dest_path, "wb") as fp: for chunk in r.iter_content(chunk_size=8192): fp.write(chunk) return True # --------------------------------------------------------------------------- # # Main script # --------------------------------------------------------------------------- # def main(json_path="people.json"): people = json.loads(pathlib.Path(json_path).read_text()) saved, skipped, err = 0, 0, 0 for person in tqdm(people, unit="person"): title = person["name"] out_path = pathlib.Path("." + person["image"]) # strip leading slash try: qid = wikipedia_to_qid(title) if not qid: tqdm.write(f"âš ï¸ {title}: no Wikidata Q-id"); err += 1; continue filename = qid_to_image_filename(qid) if not filename: tqdm.write(f"âš ï¸ {title}: no P18 image"); err += 1; continue if download_commons_file(filename, out_path): saved += 1 else: skipped += 1 # already existed time.sleep(0.3) # be polite to the APIs except Exception as e: tqdm.write(f"⌠{title}: {e}") err += 1 print(f"\nDone: {saved} downloaded, {skipped} skipped, {err} errors.") if __name__ == "__main__": if not pathlib.Path("people.json").exists(): sys.exit("people.json not found in current directory.") main("people.json")