From 3ef4619523573fc82fcc37421b558e251d2cd282 Mon Sep 17 00:00:00 2001 From: Sinatras Date: Thu, 18 Jun 2026 02:33:08 +0300 Subject: [PATCH] Require secret leaderboard pass for public scores --- src/libkernelbot/leaderboard_db.py | 45 ++++++++++++++++++++ src/libkernelbot/sql/get_hf_export_rows.sql | 34 +++++++++++---- tests/test_hf_export.py | 9 +++- tests/test_leaderboard_db.py | 47 ++++++++++++++++++++- 4 files changed, 123 insertions(+), 12 deletions(-) diff --git a/src/libkernelbot/leaderboard_db.py b/src/libkernelbot/leaderboard_db.py index 6f1413e7c..6169441ed 100644 --- a/src/libkernelbot/leaderboard_db.py +++ b/src/libkernelbot/leaderboard_db.py @@ -884,6 +884,15 @@ def get_leaderboard_submissions( AND r.score IS NOT NULL AND r.passed AND s.user_id = %s + AND EXISTS ( + SELECT 1 + FROM leaderboard.runs sr + WHERE sr.submission_id = s.id + AND sr.secret + AND sr.runner = r.runner + AND sr.mode = 'leaderboard' + AND sr.passed + ) AND NOT EXISTS ( SELECT 1 FROM leaderboard.runs sr @@ -913,6 +922,15 @@ def get_leaderboard_submissions( JOIN leaderboard.user_info ui ON s.user_id = ui.id WHERE l.name = %s AND r.runner = %s AND NOT r.secret AND r.score IS NOT NULL AND r.passed + AND EXISTS ( + SELECT 1 + FROM leaderboard.runs sr + WHERE sr.submission_id = s.id + AND sr.secret + AND sr.runner = r.runner + AND sr.mode = 'leaderboard' + AND sr.passed + ) AND NOT EXISTS ( SELECT 1 FROM leaderboard.runs sr @@ -1264,6 +1282,15 @@ def get_user_submissions( WHERE submission_id = ANY(%s) AND NOT secret AND passed + AND EXISTS ( + SELECT 1 + FROM leaderboard.runs sr + WHERE sr.submission_id = r.submission_id + AND sr.secret + AND sr.runner = r.runner + AND sr.mode = 'leaderboard' + AND sr.passed + ) AND NOT EXISTS ( SELECT 1 FROM leaderboard.runs sr @@ -1410,6 +1437,15 @@ def get_leaderboard_submission_count( AND r.score IS NOT NULL AND r.passed AND s.user_id = %s + AND EXISTS ( + SELECT 1 + FROM leaderboard.runs sr + WHERE sr.submission_id = s.id + AND sr.secret + AND sr.runner = r.runner + AND sr.mode = 'leaderboard' + AND sr.passed + ) AND NOT EXISTS ( SELECT 1 FROM leaderboard.runs sr @@ -1431,6 +1467,15 @@ def get_leaderboard_submission_count( AND NOT r.secret AND r.score IS NOT NULL AND r.passed + AND EXISTS ( + SELECT 1 + FROM leaderboard.runs sr + WHERE sr.submission_id = s.id + AND sr.secret + AND sr.runner = r.runner + AND sr.mode = 'leaderboard' + AND sr.passed + ) AND NOT EXISTS ( SELECT 1 FROM leaderboard.runs sr diff --git a/src/libkernelbot/sql/get_hf_export_rows.sql b/src/libkernelbot/sql/get_hf_export_rows.sql index 8f4d35377..26f6f5164 100644 --- a/src/libkernelbot/sql/get_hf_export_rows.sql +++ b/src/libkernelbot/sql/get_hf_export_rows.sql @@ -8,14 +8,11 @@ WITH ranked AS ( s.code_id, s.file_name, s.submission_time, - COALESCE( - sjs.status, - CASE - WHEN s.done AND r.score IS NOT NULL AND r.passed THEN 'succeeded' - WHEN s.done THEN 'failed' - ELSE s.status - END - ) as status, + CASE + WHEN s.done AND r.score IS NOT NULL AND r.passed THEN 'succeeded' + WHEN s.done THEN COALESCE(NULLIF(sjs.status, 'succeeded'), 'failed') + ELSE COALESCE(sjs.status, s.status) + END as status, r.score, r.passed, r.mode, @@ -30,7 +27,26 @@ WITH ranked AS ( LEFT JOIN leaderboard.user_info u ON s.user_id = u.id LEFT JOIN leaderboard.submission_job_status sjs ON s.id = sjs.submission_id LEFT JOIN leaderboard.runs r - ON s.id = r.submission_id AND r.mode = 'leaderboard' AND NOT r.secret + ON s.id = r.submission_id + AND r.mode = 'leaderboard' + AND NOT r.secret + AND EXISTS ( + SELECT 1 + FROM leaderboard.runs sr + WHERE sr.submission_id = s.id + AND sr.secret + AND sr.runner = r.runner + AND sr.mode = 'leaderboard' + AND sr.passed + ) + AND NOT EXISTS ( + SELECT 1 + FROM leaderboard.runs sr + WHERE sr.submission_id = s.id + AND sr.secret + AND sr.runner = r.runner + AND sr.passed = FALSE + ) LEFT JOIN leaderboard.code_files c ON s.code_id = c.id WHERE s.leaderboard_id = ANY(%s) ) diff --git a/tests/test_hf_export.py b/tests/test_hf_export.py index 83708e1ec..72ae4f1ed 100644 --- a/tests/test_hf_export.py +++ b/tests/test_hf_export.py @@ -181,6 +181,8 @@ def test_query_filters_secret_runs(self): get_hf_export_rows(db, [763]) sql = db.cursor.execute.call_args[0][0] assert "NOT r.secret" in sql + assert "sr.mode = 'leaderboard'" in sql + assert "sr.passed = FALSE" in sql def test_query_partitions_by_runner(self): db = MagicMock() @@ -196,7 +198,7 @@ def test_query_prefers_submission_job_status(self): get_hf_export_rows(db, [763]) sql = db.cursor.execute.call_args[0][0] assert "submission_job_status" in sql - assert "COALESCE(" in sql + assert "COALESCE(NULLIF(sjs.status, 'succeeded'), 'failed')" in sql assert "sjs.status" in sql def test_query_falls_back_to_derived_status_for_legacy_rows(self): @@ -205,7 +207,10 @@ def test_query_falls_back_to_derived_status_for_legacy_rows(self): get_hf_export_rows(db, [763]) sql = db.cursor.execute.call_args[0][0] assert "WHEN s.done AND r.score IS NOT NULL AND r.passed THEN 'succeeded'" in sql - assert "WHEN s.done THEN 'failed'" in sql + assert "r.score IS NOT NULL" in sql + assert "r.passed" in sql + assert "sr.mode = 'leaderboard'" in sql + assert "WHEN s.done THEN COALESCE(NULLIF(sjs.status, 'succeeded'), 'failed')" in sql def test_passes_leaderboard_ids_as_param(self): db = MagicMock() diff --git a/tests/test_leaderboard_db.py b/tests/test_leaderboard_db.py index 1da4a5223..986ca919b 100644 --- a/tests/test_leaderboard_db.py +++ b/tests/test_leaderboard_db.py @@ -287,9 +287,10 @@ def test_leaderboard_submission_count(database, submit_leaderboard): _create_submission_run( db, sub_id, mode="leaderboard", secret=False, runner="A100", score=1.5 ) + _create_submission_run(db, sub_id, mode="leaderboard", secret=True, runner="A100") submission = db.get_submission_by_id(sub_id) - assert len(submission["runs"]) == 3 + assert len(submission["runs"]) == 4 db.mark_submission_done(sub_id) with database as db: @@ -313,30 +314,35 @@ def test_leaderboard_submission_ranked(database, submit_leaderboard): "submit-leaderboard", "submission.py", 5, dangerous_code, submit_time, user_name="user" ) _create_submission_run(db, sub_id, mode="leaderboard", runner="A100", score=5.5) + _create_submission_run(db, sub_id, mode="leaderboard", secret=True, runner="A100") db.mark_submission_done(sub_id) sub_id = db.create_submission( "submit-leaderboard", "submission.py", 5, dangerous_code, submit_time, user_name="user" ) _create_submission_run(db, sub_id, mode="leaderboard", runner="A100", score=4.5) + _create_submission_run(db, sub_id, mode="leaderboard", secret=True, runner="A100") db.mark_submission_done(sub_id) sub_id = db.create_submission( "submit-leaderboard", "submission.py", 5, dangerous_code, submit_time, user_name="user" ) _create_submission_run(db, sub_id, mode="leaderboard", runner="A100", score=5.0) + _create_submission_run(db, sub_id, mode="leaderboard", secret=True, runner="A100") db.mark_submission_done(sub_id) sub_id = db.create_submission( "submit-leaderboard", "submission.py", 6, dangerous_code, submit_time, user_name="user" ) _create_submission_run(db, sub_id, mode="leaderboard", runner="A100", score=8.0) + _create_submission_run(db, sub_id, mode="leaderboard", secret=True, runner="A100") db.mark_submission_done(sub_id) sub_id = db.create_submission( "submit-leaderboard", "submission.py", 6, dangerous_code, submit_time, user_name="user" ) _create_submission_run(db, sub_id, mode="leaderboard", runner="H100", score=2.0) + _create_submission_run(db, sub_id, mode="leaderboard", secret=True, runner="H100") db.mark_submission_done(sub_id) with database as db: @@ -435,6 +441,7 @@ def test_failed_secret_benchmark_hides_public_leaderboard_score(database, submit ) _create_submission_run(db, valid, mode="leaderboard", runner="A100", score=2.0) _create_submission_run(db, valid, mode="benchmark", secret=True, runner="A100") + _create_submission_run(db, valid, mode="leaderboard", secret=True, runner="A100") db.mark_submission_done(valid) with database as db: @@ -444,6 +451,42 @@ def test_failed_secret_benchmark_hides_public_leaderboard_score(database, submit assert db.get_leaderboard_submission_count("submit-leaderboard", "A100", "5") == 0 +def test_missing_secret_leaderboard_run_hides_public_leaderboard_score( + database, submit_leaderboard +): + submit_time = datetime.datetime.now(tz=datetime.timezone.utc) + + with database as db: + public_only = db.create_submission( + "submit-leaderboard", "public_only.py", 5, "fast", submit_time, user_name="user5" + ) + _create_submission_run(db, public_only, mode="leaderboard", runner="A100", score=1.0) + db.mark_submission_done(public_only) + + secret_test_only = db.create_submission( + "submit-leaderboard", "secret_test_only.py", 6, "fast", submit_time, user_name="user6" + ) + _create_submission_run(db, secret_test_only, mode="leaderboard", runner="A100", score=1.5) + _create_submission_run( + db, secret_test_only, mode="test", secret=True, runner="A100" + ) + db.mark_submission_done(secret_test_only) + + valid = db.create_submission( + "submit-leaderboard", "valid.py", 7, "valid", submit_time, user_name="user7" + ) + _create_submission_run(db, valid, mode="leaderboard", runner="A100", score=2.0) + _create_submission_run(db, valid, mode="leaderboard", secret=True, runner="A100") + db.mark_submission_done(valid) + + with database as db: + ranked = db.get_leaderboard_submissions("submit-leaderboard", "A100") + assert [row["submission_id"] for row in ranked] == [valid] + assert db.get_leaderboard_submission_count("submit-leaderboard", "A100") == 1 + assert db.get_leaderboard_submission_count("submit-leaderboard", "A100", "5") == 0 + assert db.get_leaderboard_submission_count("submit-leaderboard", "A100", "6") == 0 + + def test_failed_secret_run_hides_user_submission_scores(database, submit_leaderboard): submit_time = datetime.datetime.now(tz=datetime.timezone.utc) failed_secret = dataclasses.replace(sample_run_result(), passed=False) @@ -943,7 +986,9 @@ def test_get_user_submissions_with_multiple_runs(database, submit_leaderboard): # Add multiple runs on different GPUs _create_submission_run(db, sub1, runner="A100", score=1.5, secret=False) + _create_submission_run(db, sub1, runner="A100", secret=True) _create_submission_run(db, sub1, runner="H100", score=2.0, secret=False) + _create_submission_run(db, sub1, runner="H100", secret=True) db.mark_submission_done(sub1) # Get submissions