Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions src/libkernelbot/leaderboard_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -884,6 +884,15 @@ def get_leaderboard_submissions(
AND r.score IS NOT NULL
AND r.passed
AND s.user_id = %s
AND EXISTS (
SELECT 1
FROM leaderboard.runs sr
WHERE sr.submission_id = s.id
AND sr.secret
AND sr.runner = r.runner
AND sr.mode = 'leaderboard'
AND sr.passed
)
AND NOT EXISTS (
SELECT 1
FROM leaderboard.runs sr
Expand Down Expand Up @@ -913,6 +922,15 @@ def get_leaderboard_submissions(
JOIN leaderboard.user_info ui ON s.user_id = ui.id
WHERE l.name = %s AND r.runner = %s AND NOT r.secret
AND r.score IS NOT NULL AND r.passed
AND EXISTS (
SELECT 1
FROM leaderboard.runs sr
WHERE sr.submission_id = s.id
AND sr.secret
AND sr.runner = r.runner
AND sr.mode = 'leaderboard'
AND sr.passed
)
AND NOT EXISTS (
SELECT 1
FROM leaderboard.runs sr
Expand Down Expand Up @@ -1264,6 +1282,15 @@ def get_user_submissions(
WHERE submission_id = ANY(%s)
AND NOT secret
AND passed
AND EXISTS (
SELECT 1
FROM leaderboard.runs sr
WHERE sr.submission_id = r.submission_id
AND sr.secret
AND sr.runner = r.runner
AND sr.mode = 'leaderboard'
AND sr.passed
)
AND NOT EXISTS (
SELECT 1
FROM leaderboard.runs sr
Expand Down Expand Up @@ -1410,6 +1437,15 @@ def get_leaderboard_submission_count(
AND r.score IS NOT NULL
AND r.passed
AND s.user_id = %s
AND EXISTS (
SELECT 1
FROM leaderboard.runs sr
WHERE sr.submission_id = s.id
AND sr.secret
AND sr.runner = r.runner
AND sr.mode = 'leaderboard'
AND sr.passed
)
AND NOT EXISTS (
SELECT 1
FROM leaderboard.runs sr
Expand All @@ -1431,6 +1467,15 @@ def get_leaderboard_submission_count(
AND NOT r.secret
AND r.score IS NOT NULL
AND r.passed
AND EXISTS (
SELECT 1
FROM leaderboard.runs sr
WHERE sr.submission_id = s.id
AND sr.secret
AND sr.runner = r.runner
AND sr.mode = 'leaderboard'
AND sr.passed
)
AND NOT EXISTS (
SELECT 1
FROM leaderboard.runs sr
Expand Down
34 changes: 25 additions & 9 deletions src/libkernelbot/sql/get_hf_export_rows.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,11 @@ WITH ranked AS (
s.code_id,
s.file_name,
s.submission_time,
COALESCE(
sjs.status,
CASE
WHEN s.done AND r.score IS NOT NULL AND r.passed THEN 'succeeded'
WHEN s.done THEN 'failed'
ELSE s.status
END
) as status,
CASE
WHEN s.done AND r.score IS NOT NULL AND r.passed THEN 'succeeded'
WHEN s.done THEN COALESCE(NULLIF(sjs.status, 'succeeded'), 'failed')
ELSE COALESCE(sjs.status, s.status)
END as status,
r.score,
r.passed,
r.mode,
Expand All @@ -30,7 +27,26 @@ WITH ranked AS (
LEFT JOIN leaderboard.user_info u ON s.user_id = u.id
LEFT JOIN leaderboard.submission_job_status sjs ON s.id = sjs.submission_id
LEFT JOIN leaderboard.runs r
ON s.id = r.submission_id AND r.mode = 'leaderboard' AND NOT r.secret
ON s.id = r.submission_id
AND r.mode = 'leaderboard'
AND NOT r.secret
AND EXISTS (
SELECT 1
FROM leaderboard.runs sr
WHERE sr.submission_id = s.id
AND sr.secret
AND sr.runner = r.runner
AND sr.mode = 'leaderboard'
AND sr.passed
)
AND NOT EXISTS (
SELECT 1
FROM leaderboard.runs sr
WHERE sr.submission_id = s.id
AND sr.secret
AND sr.runner = r.runner
AND sr.passed = FALSE
)
LEFT JOIN leaderboard.code_files c ON s.code_id = c.id
WHERE s.leaderboard_id = ANY(%s)
)
Expand Down
9 changes: 7 additions & 2 deletions tests/test_hf_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,8 @@ def test_query_filters_secret_runs(self):
get_hf_export_rows(db, [763])
sql = db.cursor.execute.call_args[0][0]
assert "NOT r.secret" in sql
assert "sr.mode = 'leaderboard'" in sql
assert "sr.passed = FALSE" in sql

def test_query_partitions_by_runner(self):
db = MagicMock()
Expand All @@ -196,7 +198,7 @@ def test_query_prefers_submission_job_status(self):
get_hf_export_rows(db, [763])
sql = db.cursor.execute.call_args[0][0]
assert "submission_job_status" in sql
assert "COALESCE(" in sql
assert "COALESCE(NULLIF(sjs.status, 'succeeded'), 'failed')" in sql
assert "sjs.status" in sql

def test_query_falls_back_to_derived_status_for_legacy_rows(self):
Expand All @@ -205,7 +207,10 @@ def test_query_falls_back_to_derived_status_for_legacy_rows(self):
get_hf_export_rows(db, [763])
sql = db.cursor.execute.call_args[0][0]
assert "WHEN s.done AND r.score IS NOT NULL AND r.passed THEN 'succeeded'" in sql
assert "WHEN s.done THEN 'failed'" in sql
assert "r.score IS NOT NULL" in sql
assert "r.passed" in sql
assert "sr.mode = 'leaderboard'" in sql
assert "WHEN s.done THEN COALESCE(NULLIF(sjs.status, 'succeeded'), 'failed')" in sql

def test_passes_leaderboard_ids_as_param(self):
db = MagicMock()
Expand Down
47 changes: 46 additions & 1 deletion tests/test_leaderboard_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,9 +287,10 @@ def test_leaderboard_submission_count(database, submit_leaderboard):
_create_submission_run(
db, sub_id, mode="leaderboard", secret=False, runner="A100", score=1.5
)
_create_submission_run(db, sub_id, mode="leaderboard", secret=True, runner="A100")
submission = db.get_submission_by_id(sub_id)

assert len(submission["runs"]) == 3
assert len(submission["runs"]) == 4

db.mark_submission_done(sub_id)
with database as db:
Expand All @@ -313,30 +314,35 @@ def test_leaderboard_submission_ranked(database, submit_leaderboard):
"submit-leaderboard", "submission.py", 5, dangerous_code, submit_time, user_name="user"
)
_create_submission_run(db, sub_id, mode="leaderboard", runner="A100", score=5.5)
_create_submission_run(db, sub_id, mode="leaderboard", secret=True, runner="A100")
db.mark_submission_done(sub_id)

sub_id = db.create_submission(
"submit-leaderboard", "submission.py", 5, dangerous_code, submit_time, user_name="user"
)
_create_submission_run(db, sub_id, mode="leaderboard", runner="A100", score=4.5)
_create_submission_run(db, sub_id, mode="leaderboard", secret=True, runner="A100")
db.mark_submission_done(sub_id)

sub_id = db.create_submission(
"submit-leaderboard", "submission.py", 5, dangerous_code, submit_time, user_name="user"
)
_create_submission_run(db, sub_id, mode="leaderboard", runner="A100", score=5.0)
_create_submission_run(db, sub_id, mode="leaderboard", secret=True, runner="A100")
db.mark_submission_done(sub_id)

sub_id = db.create_submission(
"submit-leaderboard", "submission.py", 6, dangerous_code, submit_time, user_name="user"
)
_create_submission_run(db, sub_id, mode="leaderboard", runner="A100", score=8.0)
_create_submission_run(db, sub_id, mode="leaderboard", secret=True, runner="A100")
db.mark_submission_done(sub_id)

sub_id = db.create_submission(
"submit-leaderboard", "submission.py", 6, dangerous_code, submit_time, user_name="user"
)
_create_submission_run(db, sub_id, mode="leaderboard", runner="H100", score=2.0)
_create_submission_run(db, sub_id, mode="leaderboard", secret=True, runner="H100")
db.mark_submission_done(sub_id)

with database as db:
Expand Down Expand Up @@ -435,6 +441,7 @@ def test_failed_secret_benchmark_hides_public_leaderboard_score(database, submit
)
_create_submission_run(db, valid, mode="leaderboard", runner="A100", score=2.0)
_create_submission_run(db, valid, mode="benchmark", secret=True, runner="A100")
_create_submission_run(db, valid, mode="leaderboard", secret=True, runner="A100")
db.mark_submission_done(valid)

with database as db:
Expand All @@ -444,6 +451,42 @@ def test_failed_secret_benchmark_hides_public_leaderboard_score(database, submit
assert db.get_leaderboard_submission_count("submit-leaderboard", "A100", "5") == 0


def test_missing_secret_leaderboard_run_hides_public_leaderboard_score(
    database, submit_leaderboard
):
    """Check that only a submission with a secret leaderboard-mode run is ranked.

    Three finished submissions are created on the A100 runner:

    * user 5: a public leaderboard run only;
    * user 6: a public leaderboard run plus a secret run in ``test`` mode;
    * user 7: a public leaderboard run plus a secret run in ``leaderboard`` mode.

    The assertions expect only the third submission to appear in the ranking
    and in the submission counts.
    """
    now = datetime.datetime.now(tz=datetime.timezone.utc)

    with database as db:
        # Public run only: no secret run exists for this submission at all.
        no_secret_id = db.create_submission(
            "submit-leaderboard", "public_only.py", 5, "fast", now, user_name="user5"
        )
        _create_submission_run(db, no_secret_id, mode="leaderboard", runner="A100", score=1.0)
        db.mark_submission_done(no_secret_id)

        # A secret run exists, but in "test" mode rather than "leaderboard" mode.
        wrong_mode_id = db.create_submission(
            "submit-leaderboard", "secret_test_only.py", 6, "fast", now, user_name="user6"
        )
        _create_submission_run(db, wrong_mode_id, mode="leaderboard", runner="A100", score=1.5)
        _create_submission_run(db, wrong_mode_id, mode="test", secret=True, runner="A100")
        db.mark_submission_done(wrong_mode_id)

        # Public and secret leaderboard-mode runs on the same runner.
        valid = db.create_submission(
            "submit-leaderboard", "valid.py", 7, "valid", now, user_name="user7"
        )
        _create_submission_run(db, valid, mode="leaderboard", runner="A100", score=2.0)
        _create_submission_run(db, valid, mode="leaderboard", secret=True, runner="A100")
        db.mark_submission_done(valid)

    with database as db:
        listed_ids = [
            entry["submission_id"]
            for entry in db.get_leaderboard_submissions("submit-leaderboard", "A100")
        ]
        assert listed_ids == [valid]
        assert db.get_leaderboard_submission_count("submit-leaderboard", "A100") == 1
        # Per-user counts for the two submissions expected to be hidden.
        for hidden_user in ("5", "6"):
            assert (
                db.get_leaderboard_submission_count("submit-leaderboard", "A100", hidden_user)
                == 0
            )
assert db.get_leaderboard_submission_count("submit-leaderboard", "A100", "6") == 0


def test_failed_secret_run_hides_user_submission_scores(database, submit_leaderboard):
submit_time = datetime.datetime.now(tz=datetime.timezone.utc)
failed_secret = dataclasses.replace(sample_run_result(), passed=False)
Expand Down Expand Up @@ -943,7 +986,9 @@ def test_get_user_submissions_with_multiple_runs(database, submit_leaderboard):

# Add multiple runs on different GPUs
_create_submission_run(db, sub1, runner="A100", score=1.5, secret=False)
_create_submission_run(db, sub1, runner="A100", secret=True)
_create_submission_run(db, sub1, runner="H100", score=2.0, secret=False)
_create_submission_run(db, sub1, runner="H100", secret=True)
db.mark_submission_done(sub1)

# Get submissions
Expand Down
Loading