Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ dependencies = [
"fastapi[all]",
"uvicorn",
"jinja2",
"huggingface-hub>=0.20",
"pyarrow>=14.0",
]

[project.optional-dependencies]
Expand Down
47 changes: 47 additions & 0 deletions src/kernelbot/api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -675,6 +675,53 @@ async def admin_update_problems(
}


@app.post("/admin/export-hf")
async def admin_export_hf(
    payload: dict,
    _: Annotated[None, Depends(require_admin)],
    db_context=Depends(get_db),
) -> dict:
    """Export competition submissions to a Hugging Face dataset as parquet.

    Payload:
        leaderboard_ids: list[int] - IDs of leaderboards to export
        filename: str - parquet filename in the repo (e.g. "nvidia_nvfp4_submissions.parquet")
        private: bool - if true, upload to private live repo; if false, upload to
            public repo (default: true)

    Returns:
        ``{"status": "ok"}`` merged with the mapping returned by ``export_to_hf``.

    Raises:
        HTTPException: 400 when the payload is malformed or ``export_to_hf``
            raises ``ValueError``; 500 when ``HF_TOKEN`` is not configured or
            the export fails for any other reason.
    """
    from libkernelbot.hf_export import export_to_hf

    leaderboard_ids = payload.get("leaderboard_ids")
    filename = payload.get("filename")
    private = payload.get("private", True)

    # bool is a subclass of int, so True/False would otherwise be accepted as
    # leaderboard IDs 1/0; reject them explicitly.
    ids_are_valid = (
        isinstance(leaderboard_ids, list)
        and bool(leaderboard_ids)
        and all(
            isinstance(leaderboard_id, int) and not isinstance(leaderboard_id, bool)
            for leaderboard_id in leaderboard_ids
        )
    )
    if not ids_are_valid:
        raise HTTPException(status_code=400, detail="leaderboard_ids must be a non-empty list of integers")
    if not isinstance(filename, str) or not filename.endswith(".parquet"):
        raise HTTPException(status_code=400, detail="filename must end with .parquet")
    # Without this check an explicit null (or 0 / "") is falsy and would send
    # the export to the PUBLIC dataset; only a real boolean may pick the repo.
    if not isinstance(private, bool):
        raise HTTPException(status_code=400, detail="private must be a boolean")
    if not env.HF_TOKEN:
        raise HTTPException(status_code=500, detail="HF_TOKEN not configured")

    repo_id = env.HF_PRIVATE_DATASET if private else env.HF_PUBLIC_DATASET

    try:
        with db_context as db:
            result = export_to_hf(
                db=db,
                leaderboard_ids=leaderboard_ids,
                repo_id=repo_id,
                filename=filename,
                token=env.HF_TOKEN,
                private=private,
            )
        return {"status": "ok", **result}
    except ValueError as e:
        # ValueError from the exporter is reported as a client error.
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Export failed: {e}") from e


@app.get("/leaderboards")
async def get_leaderboards(db_context=Depends(get_db)):
"""An endpoint that returns all leaderboards.
Expand Down
100 changes: 100 additions & 0 deletions src/kernelbot/cogs/admin_cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,13 @@ def __init__(self, bot: "ClusterBot"):
name="set-forum-ids", description="Sets forum IDs"
)(self.set_forum_ids)

self.export_to_hf = bot.admin_group.command(
name="export-hf", description="Export competition data to Hugging Face dataset"
)(self.export_to_hf)

self._scheduled_cleanup_temp_users.start()
if env.HF_TOKEN:
self._scheduled_hf_export.start()

# --------------------------------------------------------------------------
# | HELPER FUNCTIONS |
Expand Down Expand Up @@ -873,6 +879,100 @@ async def _scheduled_cleanup_temp_users(self):
db.cleanup_temp_users()
logger.info("Temporary users cleanup completed")

@tasks.loop(hours=24)
async def _scheduled_hf_export(self):
    """Export submissions of active competitions to the private HF dataset.

    Scheduled with a 24-hour interval. Any exception is logged here and not
    re-raised, so a failed export never propagates out of the task body.
    """
    from libkernelbot.hf_export import export_to_hf, get_active_competition_leaderboards

    try:
        with self.bot.leaderboard_db as db:
            live_boards = get_active_competition_leaderboards(
                db.get_leaderboards(),
                now=datetime.now(timezone.utc),
            )

            if live_boards:
                summary = export_to_hf(
                    db=db,
                    leaderboard_ids=[board["id"] for board in live_boards],
                    repo_id=env.HF_PRIVATE_DATASET,
                    filename="active_submissions.parquet",
                    token=env.HF_TOKEN,
                    private=True,
                )
                logger.info("Scheduled HF export complete: %s", summary)
            else:
                # Nothing is running right now, so there is nothing to upload.
                logger.info("HF export: no active competitions, skipping")
    except Exception:
        logger.exception("Scheduled HF export failed")

@_scheduled_hf_export.before_loop
async def _before_hf_export(self):
    """Wait for the bot to report ready before the export loop begins."""
    bot = self.bot
    await bot.wait_until_ready()

@discord.app_commands.describe(
    leaderboard_name="Name of the competition to export",
    filename="Parquet filename (default: <leaderboard_name>.parquet)",
    private="Upload to private repo (default: true)",
)
@discord.app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete)
@with_error_handling
async def export_to_hf(
    self,
    interaction: discord.Interaction,
    leaderboard_name: str,
    filename: Optional[str] = None,
    private: bool = True,
):
    """Admin command: export one leaderboard to a Hugging Face dataset.

    Args:
        interaction: The Discord interaction that invoked the command.
        leaderboard_name: Leaderboard whose ID is looked up and exported.
        filename: Target parquet filename; defaults to
            ``<leaderboard_name>.parquet`` and gets ``.parquet`` appended when
            the suffix is missing.
        private: Selects the private dataset when true, the public one otherwise.

    All replies are ephemeral. Non-admin callers and a missing ``HF_TOKEN``
    are rejected before the interaction is deferred.
    """
    from libkernelbot.hf_export import export_to_hf as do_export

    if not await self.admin_check(interaction):
        await send_discord_message(
            interaction,
            "You need to have Admin permissions to run this command",
            ephemeral=True,
        )
        return

    if not env.HF_TOKEN:
        await send_discord_message(interaction, "HF_TOKEN not configured.", ephemeral=True)
        return

    await interaction.response.defer(ephemeral=True)

    # Resolve the destination filename, ensuring it carries the parquet suffix.
    target_file = f"{leaderboard_name}.parquet" if filename is None else filename
    if not target_file.endswith(".parquet"):
        target_file = f"{target_file}.parquet"

    if private:
        repo_id = env.HF_PRIVATE_DATASET
    else:
        repo_id = env.HF_PUBLIC_DATASET

    try:
        with self.bot.leaderboard_db as db:
            outcome = do_export(
                db=db,
                leaderboard_ids=[db.get_leaderboard_id(leaderboard_name)],
                repo_id=repo_id,
                filename=target_file,
                token=env.HF_TOKEN,
                private=private,
            )
        await send_discord_message(
            interaction,
            f"Exported {outcome['rows']} rows to `{repo_id}/{target_file}`.",
            ephemeral=True,
        )
    except ValueError as err:
        # ValueError text is shown to the admin as-is.
        await send_discord_message(interaction, str(err), ephemeral=True)
    except Exception as err:
        logger.error("HF export failed: %s", err, exc_info=True)
        await send_discord_message(interaction, f"Export failed: {err}", ephemeral=True)

####################################################################################################################
# MIGRATION COMMANDS --- TO BE DELETED LATER
####################################################################################################################
Expand Down
3 changes: 3 additions & 0 deletions src/kernelbot/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
env.DISCORD_DEBUG_CLUSTER_STAGING_ID = os.getenv("DISCORD_DEBUG_CLUSTER_STAGING_ID")

env.ADMIN_TOKEN = os.getenv("ADMIN_TOKEN")
# Hugging Face export settings.
# HF_TOKEN has no default, so it is None when the variable is unset; the
# export code paths check it and refuse to run (or are not scheduled) then.
env.HF_TOKEN = os.getenv("HF_TOKEN")
# Dataset repo IDs used as the upload target for private / public exports;
# each falls back to the default below when its variable is unset.
env.HF_PRIVATE_DATASET = os.getenv("HF_PRIVATE_DATASET", "GPUMODE/kernelbot-data-live")
env.HF_PUBLIC_DATASET = os.getenv("HF_PUBLIC_DATASET", "GPUMODE/kernelbot-data")

# Only required to run the CLI against this instance
# setting these is required only to run the CLI against local instance
Expand Down
Loading