Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions app/docs/dev.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,15 @@ To successfully run this function, ensure you set your OpenAI API key. The code

1. **model**:
- Defines the AI model used for evaluation.
- Accepts any OpenAI model string (e.g. `gpt-4o-mini`, `gpt-4o`). Recommended: `gpt-4o-mini`.
- Accepts a simple alias (`small`, `medium`, `large`, `reasoning`) or any raw OpenAI model string (e.g. `gpt-4o-mini`).
- Alias targets have defaults but can be overridden per-call via `small_model`, `medium_model`, `large_model`, and `reasoning_model` parameters.

| Alias | Default model | Override parameter |
|---|---|---|
| `small` | `gpt-4o-mini` | `small_model` |
| `medium` | `gpt-4o` | `medium_model` |
| `large` | `gpt-4.1` | `large_model` |
| `reasoning` | `o4-mini` | `reasoning_model` |

2. **question** *(optional)*:
- The text of the question being answered by the student.
Expand Down Expand Up @@ -61,7 +69,7 @@ Note that an input of a variable called `answer` is also required. This can be a

```python
parameters = {
'model': 'gpt-4o-mini',
'model': 'small',
'question': 'What is photosynthesis?',
'main_prompt': "The question asked was: {{question}}. The correct answer is: {{answer}}. Evaluate the student's response: {{response}}.",
'default_prompt': "Output a Boolean: True if the student is correct and False if they are incorrect.",
Expand All @@ -88,7 +96,7 @@ The function returns a dictionary with the following structure:

```python
parameters = {
'model': 'gpt-4o-mini',
'model': 'small',
'main_prompt': "Analyze the student's response about the capital of France. The correct answer is {{answer}}.",
'default_prompt': "Output a Boolean: True if the student is correct and False if they are incorrect.",
'feedback_prompt': "You are an AI tutor. Offer constructive feedback."
Expand Down
11 changes: 10 additions & 1 deletion app/docs/user.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,16 @@ This chatGPT evaluation function is designed to automatically evaluate student r

## What does the teacher need to input?
- `model`
- Suggested (as of July 2025): `gpt-4o-mini` or `gpt-4.1-mini`.
- Use a simple alias: `small`, `medium`, `large`, or `reasoning`. You can also pass any raw OpenAI model string directly (e.g. `gpt-4o-mini`).

| Alias | Default model | When to use |
|---|---|---|
| `small` | `gpt-4o-mini` | Fast and cheap; good for most questions |
| `medium` | `gpt-4o` | Better reasoning; use for nuanced marking |
| `large` | `gpt-4.1` | Most capable; use for complex evaluation |
| `reasoning` | `o4-mini` | Structured reasoning; use for multi-step problems |

To override a default, add the corresponding parameter: `small_model`, `medium_model`, `large_model`, or `reasoning_model`.

- `question` [optional]
- The text of the question being answered. Set this if you want to reference the question wording inside your prompts using `{{question}}`.
Expand Down
18 changes: 15 additions & 3 deletions app/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,16 @@
# A basic way to call ChatGPT from the Lambda Feedback platform


def resolve_model(model_str, parameters):
    """Translate a model alias into a concrete OpenAI model name.

    Args:
        model_str: Either one of the aliases ``small``, ``medium``,
            ``large`` or ``reasoning``, or a raw model string, which is
            returned unchanged.
        parameters: The evaluation parameters dict. It may carry a per-call
            override for an alias under ``small_model``, ``medium_model``,
            ``large_model`` or ``reasoning_model``.

    Returns:
        The override for the alias when one is set, otherwise the built-in
        default for that alias. Anything that is not a known alias is
        passed through as-is.
    """
    # alias -> (override parameter name, built-in default model)
    alias_table = {
        "small": ("small_model", "gpt-4o-mini"),
        "medium": ("medium_model", "gpt-4o"),
        "large": ("large_model", "gpt-4.1"),
        "reasoning": ("reasoning_model", "o4-mini"),
    }

    # Check the type first: an unhashable value (e.g. a list) would raise
    # TypeError on the dict membership test instead of passing through.
    if not isinstance(model_str, str) or model_str not in alias_table:
        return model_str

    override_key, default_model = alias_table[model_str]
    override = parameters.get(override_key)

    # An override that is present but blank (None, "" or only whitespace)
    # counts as "not set"; otherwise the blank value itself would be used
    # as the model name.
    if override is None or (isinstance(override, str) and not override.strip()):
        return default_model
    return override


def process_prompt(prompt, question, response, answer):
prompt = prompt.replace("{{answer}}", str(answer))
prompt = prompt.replace("{{question}}", str(question) or "")
Expand Down Expand Up @@ -49,6 +59,8 @@ def evaluation_function(response, answer, parameters):

openai.api_key = os.environ.get("OPENAI_API_KEY")

model = resolve_model(parameters['model'], parameters)

question = parameters.get("question")
moderator_prompt = parameters.get(
"moderator_prompt",
Expand All @@ -69,7 +81,7 @@ def evaluation_function(response, answer, parameters):

# Call openAI API for moderation
moderation_boolean = openai.ChatCompletion.create(
model=parameters['model'],
model=model,
messages=[{"role": "system", "content": moderator_prompt},
{"role": "user", "content": response}])

Expand All @@ -81,7 +93,7 @@ def evaluation_function(response, answer, parameters):

# Call openAI API for boolean
completion_boolean = openai.ChatCompletion.create(
model=parameters['model'],
model=model,
messages=[
{"role": "system", "content": main_prompt + " " + default_prompt}])

Expand All @@ -94,7 +106,7 @@ def evaluation_function(response, answer, parameters):
# Check if feedback prompt is empty or not. Only populates feedback in 'output' if there is a 'feedback_prompt'.
if parameters['feedback_prompt'].strip():
completion_feedback = openai.ChatCompletion.create(
model=parameters['model'],
model=model,
messages=[{"role": "system", "content": " The student response has been judged as " +
is_correct_str + main_prompt + " " + feedback_prompt + "# Reminder: the student response is "+is_correct_str}])

Expand Down
21 changes: 18 additions & 3 deletions app/evaluation_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
load_dotenv()

try:
from .evaluation import evaluation_function
from .evaluation import evaluation_function, resolve_model
except ImportError:
from evaluation import evaluation_function
from evaluation import evaluation_function, resolve_model

model = 'gpt-4o-mini'
model = 'small'

default_prompt = "Output a Boolean: True if the student is correct and False if the student is incorrect. Be reasonable."
feedback_prompt = "Give objective and constructive feedback. Don't give the correct answer away. Short answer # Student reponse: {{response}}. # Closing remark: Keep it short."
Expand Down Expand Up @@ -90,5 +90,20 @@ def test_physics_definition(self):
self.assertEqual(output["is_correct"], True)


class TestModelAliases(unittest.TestCase):
    """Checks for ``resolve_model``: defaults, overrides and pass-through."""

    def test_default_aliases(self):
        """With no overrides, each alias resolves to its built-in default."""
        expected_defaults = {
            'small': 'gpt-4o-mini',
            'medium': 'gpt-4o',
            'large': 'gpt-4.1',
            'reasoning': 'o4-mini',
        }
        for alias, expected_model in expected_defaults.items():
            self.assertEqual(resolve_model(alias, {}), expected_model)

    def test_alias_override_via_parameters(self):
        """A ``small_model`` entry in the parameters replaces the default."""
        overrides = {'small_model': 'gpt-4.1-nano'}
        resolved = resolve_model('small', overrides)
        self.assertEqual(resolved, 'gpt-4.1-nano')

    def test_raw_model_string_passthrough(self):
        """A string that is not an alias is returned unchanged."""
        raw_name = 'gpt-4o-mini'
        self.assertEqual(resolve_model(raw_name, {}), 'gpt-4o-mini')


if __name__ == "__main__":
unittest.main()