Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from typing import List, TypedDict | |
| import huggingface_hub | |
| from huggingface_hub.hf_api import SpaceInfo | |
| from concurrent.futures import ThreadPoolExecutor | |
| import os | |
| import json | |
| import datetime | |
| import tqdm | |
| import requests | |
| from pathlib import Path | |
| from screenshot import get_screen_shot | |
| import boto3 | |
| from threading import Lock | |
class SpaceData(TypedDict):
    """Record describing one theme-preview Space, as returned by ``get_info``.

    The keys mirror the dict literal built at the end of ``get_info``;
    ``sha`` and ``screenshot_id`` are part of that dict and are therefore
    declared here as well.
    """

    id: str  # Space id taken from the Hub's space info
    likes: int
    subdomain: str  # full URL of the form "https://<subdomain>.hf.space/"
    lastModified: str  # formatted with "%Y-%m-%d"
    status: str  # value of ``space_info.runtime.stage``
    sha: str  # ``space_info.sha``; presumably a commit hash -- TODO confirm it is never None
    screenshot_id: str  # Space id with "/" replaced by "_"
# Local checkout (in ./data) of the dataset repo that stores the crawl results.
repo = huggingface_hub.Repository(
    local_dir="data",
    repo_type="dataset",
    clone_from="freddyaboulton/gradio-theme-subdomains",
    token=os.getenv("HF_TOKEN"),
)
repo.git_pull()

# Previous crawl results, indexed by Space id. The file is opened in a
# ``with`` block so the handle is closed as soon as the JSON is parsed.
# NOTE(review): this reads val_subdomains.json while process_spaces writes
# subdomains.json -- confirm the two file names are intentionally different.
with open("data/val_subdomains.json", encoding="utf-8") as _prev_file:
    prev_data = {s["id"]: s for s in json.load(_prev_file)}

# Directory (inside the checkout) where screenshot files are written.
screen_shot_dir = Path("data") / "images"
screen_shot_dir.mkdir(exist_ok=True, parents=True)

# S3 client used to upload screenshots; credentials come from the environment.
s3_client = boto3.client(
    "s3",
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
)

# Held around each get_screen_shot call in get_info.
lock = Lock()

# Shared Hub API client, authenticated with HF_TOKEN.
api = huggingface_hub.HfApi(token=os.getenv("HF_TOKEN"))
def get_theme_preview_spaces() -> List[SpaceInfo]:
    """Return every Space the Hub lists under the ``gradio-theme`` filter."""
    matching_spaces = api.list_spaces(filter="gradio-theme")
    return list(matching_spaces)
# Screenshot capture is switched off. The original guard was a literal
# ``if False:`` with ``status == "RUNNING"`` left behind in a comment; flip
# this flag to restore that condition.
_SCREENSHOTS_ENABLED = False

# Upper bound, in seconds, for the request sent to a sleeping Space's page.
_WAKE_TIMEOUT_SECONDS = 30


def _refresh_screenshots(space_info: SpaceInfo, img_id: str) -> None:
    """Re-capture and upload light/dark screenshots when the Space's sha changed.

    NOTE(review): the source this was reconstructed from had lost its
    indentation; the screenshot and upload calls are assumed to sit inside
    the sha check -- confirm against the original file.
    """
    known_sha = prev_data.get(space_info.id, {}).get("sha")
    if known_sha and known_sha == space_info.sha:
        return

    # setdefault: a Space missing from prev_data is exactly the case the
    # check above lets through, and plain indexing raised KeyError for it.
    prev_data.setdefault(space_info.id, {})["sha"] = space_info.sha

    variants = ("light", "dark")
    local_files = {
        variant: str(screen_shot_dir / f"{img_id}_{variant}.jpg")
        for variant in variants
    }

    for variant in variants:
        # Each capture runs while holding the module-level lock.
        with lock:
            # The meaning of the literal 3 is defined by get_screen_shot,
            # which is not visible here.
            get_screen_shot(
                f"https://{space_info.subdomain}.hf.space?__theme={variant}",
                3,
                local_files[variant],
            )

    for variant in variants:
        s3_client.upload_file(
            local_files[variant],
            "gradio-theme-screenshots",
            f"{img_id}_{variant}.jpg",
            ExtraArgs={"ContentType": "image/jpg"},
        )


def get_info(space_name: SpaceInfo) -> SpaceData | None:
    """Build the ``SpaceData`` record for one Space.

    Args:
        space_name: Listing entry for the Space (a ``SpaceInfo``, despite
            the parameter name); only its ``id`` is used to fetch the full
            info from the Hub.

    Returns:
        A dict with the Space's id, likes, sha, last-modified date, screenshot
        id, status and URL, or ``None`` when the Space has no id, is private,
        or has no subdomain.
    """
    if not space_name.id:
        print(f"no space_name for {space_name}")
        return None

    space_info = api.space_info(space_name.id, token=os.getenv("HF_TOKEN"))
    if space_info.private:
        print(f"{space_name} is private")
        return None

    subdomain: str | None = getattr(space_info, "subdomain", None)
    if subdomain is None:
        print(f"no subdomain for {space_info.id}")
        return None

    status = space_info.runtime.stage
    img_id = space_info.id.replace("/", "_")

    if _SCREENSHOTS_ENABLED and status == "RUNNING":
        _refresh_screenshots(space_info, img_id)

    if status not in ["SLEEPING", "RUNNING", "RUNNING_BUILDING", "BUILDING"]:
        print(f"Space not running, building, or sleeping {space_info.id}")
    elif status == "SLEEPING":
        # The response is discarded, so this is best-effort (presumably it
        # wakes the Space). Bound it with a timeout and log failures so one
        # stalled or refused connection cannot hang or abort the whole crawl.
        try:
            requests.get(
                f"https://huggingface.co/spaces/{space_info.id}",
                timeout=_WAKE_TIMEOUT_SECONDS,
            )
        except requests.RequestException as err:
            print(f"request to sleeping space {space_info.id} failed: {err}")

    return {
        "id": space_info.id,
        "likes": space_info.likes,
        "sha": space_info.sha,
        "lastModified": space_info.lastModified.strftime("%Y-%m-%d"),
        "screenshot_id": img_id,
        "status": status,
        "subdomain": f"https://{subdomain}.hf.space/",
    }  # type: ignore
def get_all_info(spaces: List[SpaceInfo]) -> List[SpaceData]:
    """Run ``get_info`` over *spaces* on a 10-thread pool and keep the truthy results.

    A tqdm progress bar tracks the lookups; results keep the order of *spaces*.
    """
    with ThreadPoolExecutor(max_workers=10) as pool:
        lookups = pool.map(get_info, spaces)
        progress = tqdm.tqdm(lookups, total=len(spaces))
        collected = list(progress)
    # Drop the entries for which get_info returned None (or anything falsy).
    return list(filter(None, collected))
def process_spaces():
    """Crawl all theme-preview Spaces, write the results to JSON, and push the repo.

    The collected records are dumped to ``data/subdomains.json`` and the
    dataset repo is then pushed to the Hub without blocking.
    """
    theme_spaces = get_theme_preview_spaces()
    all_info = get_all_info(theme_spaces)

    # Close (and therefore flush) the file before pushing, so the commit is
    # guaranteed to see the complete JSON.
    with open("data/subdomains.json", "w", encoding="utf-8") as out_file:
        json.dump(all_info, out_file)

    repo.push_to_hub(
        blocking=False, commit_message=f"Updating data at {datetime.datetime.now()}"
    )