From fd0702b067ac2272faed044b9c2d1f4dce791c09 Mon Sep 17 00:00:00 2001 From: Marcus Messer Date: Thu, 11 Jun 2026 11:33:28 +0100 Subject: [PATCH 1/3] Add support for model aliases and corresponding test coverage --- app/evaluation.py | 15 ++++++++++++--- app/evaluation_tests.py | 18 +++++++++++++++--- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/app/evaluation.py b/app/evaluation.py index 4b84368..73a80ce 100755 --- a/app/evaluation.py +++ b/app/evaluation.py @@ -5,6 +5,13 @@ load_dotenv() +MODEL_ALIASES = { + "small": "gpt-4o-mini", + "medium": "gpt-4o", + "large": "gpt-4.1", + "reasoning": "o4-mini", +} + # A basic way to call ChatGPT from the Lambda Feedback platform @@ -49,6 +56,8 @@ def evaluation_function(response, answer, parameters): openai.api_key = os.environ.get("OPENAI_API_KEY") + model = MODEL_ALIASES.get(parameters['model'], parameters['model']) + question = parameters.get("question") moderator_prompt = parameters.get( "moderator_prompt", @@ -69,7 +78,7 @@ def evaluation_function(response, answer, parameters): # Call openAI API for moderation moderation_boolean = openai.ChatCompletion.create( - model=parameters['model'], + model=model, messages=[{"role": "system", "content": moderator_prompt}, {"role": "user", "content": response}]) @@ -81,7 +90,7 @@ def evaluation_function(response, answer, parameters): # Call openAI API for boolean completion_boolean = openai.ChatCompletion.create( - model=parameters['model'], + model=model, messages=[ {"role": "system", "content": main_prompt + " " + default_prompt}]) @@ -94,7 +103,7 @@ def evaluation_function(response, answer, parameters): # Check if feedback prompt is empty or not. Only populates feedback in 'output' if there is a 'feedback_prompt'. if parameters['feedback_prompt'].strip(): completion_feedback = openai.ChatCompletion.create( - model=parameters['model'], + model=model, messages=[{"role": "system", "content": " The student response has been judged as " + is_correct_str + main_prompt + " " + feedback_prompt + "# Reminder: the student response is "+is_correct_str}]) diff --git a/app/evaluation_tests.py b/app/evaluation_tests.py index 5596d37..164612b 100755 --- a/app/evaluation_tests.py +++ b/app/evaluation_tests.py @@ -6,11 +6,11 @@ load_dotenv() try: - from .evaluation import evaluation_function + from .evaluation import evaluation_function, MODEL_ALIASES except ImportError: - from evaluation import evaluation_function + from evaluation import evaluation_function, MODEL_ALIASES -model = 'gpt-4o-mini' +model = 'small' default_prompt = "Output a Boolean: True if the student is correct and False if the student is incorrect. Be reasonable." feedback_prompt = "Give objective and constructive feedback. Don't give the correct answer away. Short answer # Student reponse: {{response}}. # Closing remark: Keep it short." @@ -90,5 +90,17 @@ def test_physics_definition(self): self.assertEqual(output["is_correct"], True) +class TestModelAliases(unittest.TestCase): + + def test_all_aliases_defined(self): + for name in ('small', 'medium', 'large', 'reasoning'): + self.assertIn(name, MODEL_ALIASES) + self.assertTrue(MODEL_ALIASES[name]) + + def test_raw_model_string_passthrough(self): + raw = 'gpt-4o-mini' + self.assertEqual(MODEL_ALIASES.get(raw, raw), raw) + + if __name__ == "__main__": unittest.main() From 21fab651b52630b8d470b01976b2bcc8e1d2db7b Mon Sep 17 00:00:00 2001 From: Marcus Messer Date: Thu, 11 Jun 2026 13:47:43 +0100 Subject: [PATCH 2/3] Update documentation to introduce model aliases with usage examples --- app/docs/dev.md | 13 ++++++++++--- app/docs/user.md | 9 ++++++++- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/app/docs/dev.md b/app/docs/dev.md index 7e079b4..df6f8cd 100644 --- a/app/docs/dev.md +++ b/app/docs/dev.md @@ -19,7 +19,14 @@ To successfully run this function, ensure you set your OpenAI API key. The code 1. **model**: - Defines the AI model used for evaluation. - - Accepts any OpenAI model string (e.g. `gpt-4o-mini`, `gpt-4o`). Recommended: `gpt-4o-mini`. + - Accepts a simple alias (`small`, `medium`, `large`, `reasoning`) or any raw OpenAI model string (e.g. `gpt-4o-mini`). + + | Alias | Model | + |---|---| + | `small` | `gpt-4o-mini` | + | `medium` | `gpt-4o` | + | `large` | `gpt-4.1` | + | `reasoning` | `o4-mini` | 2. **question** *(optional)*: - The text of the question being answered by the student. @@ -61,7 +68,7 @@ Note that an input of a variable called `answer` is also required. This can be a ```python parameters = { - 'model': 'gpt-4o-mini', + 'model': 'small', 'question': 'What is photosynthesis?', 'main_prompt': "The question asked was: {{question}}. The correct answer is: {{answer}}. Evaluate the student's response: {{response}}.", 'default_prompt': "Output a Boolean: True if the student is correct and False if they are incorrect.", @@ -88,7 +95,7 @@ The function returns a dictionary with the following structure: ```python parameters = { - 'model': 'gpt-4o-mini', + 'model': 'small', 'main_prompt': "Analyze the student's response about the capital of France. The correct answer is {{answer}}.", 'default_prompt': "Output a Boolean: True if the student is correct and False if they are incorrect.", 'feedback_prompt': "You are an AI tutor. Offer constructive feedback." diff --git a/app/docs/user.md b/app/docs/user.md index 8417b77..ae1c3f6 100644 --- a/app/docs/user.md +++ b/app/docs/user.md @@ -5,7 +5,14 @@ This chatGPT evaluation function is designed to automatically evaluate student r ## What does the teacher need to input? - `model` - - Suggest (July 2025), `gpt-4o-mini` or `gpt-4.1-mini`. + - Use a simple alias: `small`, `medium`, `large`, or `reasoning`. You can also pass any raw OpenAI model string directly (e.g. `gpt-4o-mini`). + + | Alias | Model | When to use | + |---|---|---| + | `small` | `gpt-4o-mini` | Fast and cheap; good for most questions | + | `medium` | `gpt-4o` | Better reasoning; use for nuanced marking | + | `large` | `gpt-4.1` | Most capable; use for complex evaluation | + | `reasoning` | `o4-mini` | Structured reasoning; use for multi-step problems | - `question` [optional] - The text of the question being answered. Set this if you want to reference the question wording inside your prompts using `{{question}}`. From 55e1996e10ce7e5a059a812f324dce4d852633ba Mon Sep 17 00:00:00 2001 From: Marcus Messer Date: Thu, 11 Jun 2026 15:33:47 +0100 Subject: [PATCH 3/3] Introduce support for overriding model aliases via parameters, update docs, and adjust tests accordingly --- app/docs/dev.md | 15 ++++++++------- app/docs/user.md | 4 +++- app/evaluation.py | 19 +++++++++++-------- app/evaluation_tests.py | 19 +++++++++++-------- 4 files changed, 33 insertions(+), 24 deletions(-) diff --git a/app/docs/dev.md b/app/docs/dev.md index df6f8cd..be69755 100644 --- a/app/docs/dev.md +++ b/app/docs/dev.md @@ -20,13 +20,14 @@ To successfully run this function, ensure you set your OpenAI API key. The code 1. **model**: - Defines the AI model used for evaluation. - Accepts a simple alias (`small`, `medium`, `large`, `reasoning`) or any raw OpenAI model string (e.g. `gpt-4o-mini`). - - | Alias | Model | - |---|---| - | `small` | `gpt-4o-mini` | - | `medium` | `gpt-4o` | - | `large` | `gpt-4.1` | - | `reasoning` | `o4-mini` | + - Alias targets have defaults but can be overridden per-call via `small_model`, `medium_model`, `large_model`, and `reasoning_model` parameters. + + | Alias | Default model | Override parameter | + |---|---|---| + | `small` | `gpt-4o-mini` | `small_model` | + | `medium` | `gpt-4o` | `medium_model` | + | `large` | `gpt-4.1` | `large_model` | + | `reasoning` | `o4-mini` | `reasoning_model` | 2. **question** *(optional)*: - The text of the question being answered by the student. diff --git a/app/docs/user.md b/app/docs/user.md index ae1c3f6..a257fc8 100644 --- a/app/docs/user.md +++ b/app/docs/user.md @@ -7,13 +7,15 @@ This chatGPT evaluation function is designed to automatically evaluate student r - `model` - Use a simple alias: `small`, `medium`, `large`, or `reasoning`. You can also pass any raw OpenAI model string directly (e.g. `gpt-4o-mini`). - | Alias | Model | When to use | + | Alias | Default model | When to use | |---|---|---| | `small` | `gpt-4o-mini` | Fast and cheap; good for most questions | | `medium` | `gpt-4o` | Better reasoning; use for nuanced marking | | `large` | `gpt-4.1` | Most capable; use for complex evaluation | | `reasoning` | `o4-mini` | Structured reasoning; use for multi-step problems | + To override a default, add the corresponding parameter: `small_model`, `medium_model`, `large_model`, or `reasoning_model`. + - `question` [optional] - The text of the question being answered. Set this if you want to reference the question wording inside your prompts using `{{question}}`. diff --git a/app/evaluation.py b/app/evaluation.py index 73a80ce..589a6f5 100755 --- a/app/evaluation.py +++ b/app/evaluation.py @@ -5,16 +5,19 @@ load_dotenv() -MODEL_ALIASES = { - "small": "gpt-4o-mini", - "medium": "gpt-4o", - "large": "gpt-4.1", - "reasoning": "o4-mini", -} - # A basic way to call ChatGPT from the Lambda Feedback platform +def resolve_model(model_str, parameters): + aliases = { + "small": parameters.get("small_model", "gpt-4o-mini"), + "medium": parameters.get("medium_model", "gpt-4o"), + "large": parameters.get("large_model", "gpt-4.1"), + "reasoning": parameters.get("reasoning_model", "o4-mini"), + } + return aliases.get(model_str, model_str) + + def process_prompt(prompt, question, response, answer): prompt = prompt.replace("{{answer}}", str(answer)) prompt = prompt.replace("{{question}}", str(question) or "") @@ -56,7 +59,7 @@ def evaluation_function(response, answer, parameters): openai.api_key = os.environ.get("OPENAI_API_KEY") - model = MODEL_ALIASES.get(parameters['model'], parameters['model']) + model = resolve_model(parameters['model'], parameters) question = parameters.get("question") moderator_prompt = parameters.get( diff --git a/app/evaluation_tests.py b/app/evaluation_tests.py index 164612b..e8de7a4 100755 --- a/app/evaluation_tests.py +++ b/app/evaluation_tests.py @@ -6,9 +6,9 @@ load_dotenv() try: - from .evaluation import evaluation_function, MODEL_ALIASES + from .evaluation import evaluation_function, resolve_model except ImportError: - from evaluation import evaluation_function, MODEL_ALIASES + from evaluation import evaluation_function, resolve_model model = 'small' @@ -92,14 +92,17 @@ def test_physics_definition(self): class TestModelAliases(unittest.TestCase): - def test_all_aliases_defined(self): - for name in ('small', 'medium', 'large', 'reasoning'): - self.assertIn(name, MODEL_ALIASES) - self.assertTrue(MODEL_ALIASES[name]) + def test_default_aliases(self): + self.assertEqual(resolve_model('small', {}), 'gpt-4o-mini') + self.assertEqual(resolve_model('medium', {}), 'gpt-4o') + self.assertEqual(resolve_model('large', {}), 'gpt-4.1') + self.assertEqual(resolve_model('reasoning', {}), 'o4-mini') + + def test_alias_override_via_parameters(self): + self.assertEqual(resolve_model('small', {'small_model': 'gpt-4.1-nano'}), 'gpt-4.1-nano') def test_raw_model_string_passthrough(self): - raw = 'gpt-4o-mini' - self.assertEqual(MODEL_ALIASES.get(raw, raw), raw) + self.assertEqual(resolve_model('gpt-4o-mini', {}), 'gpt-4o-mini') if __name__ == "__main__":