{
|
|
"evalId": "eval-2024-07-08T18:39:48",
|
|
"results": {
|
|
"version": 2,
|
|
"timestamp": "2024-07-08T18:39:48.303Z",
|
|
"results": [
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_two"
|
|
},
|
|
"prompt": {
|
|
"raw": "How can you create multiple test cases for an evaluation in the Anthropic Evaluation tool?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "How can you create multiple test cases for an evaluation in the Anthropic Evaluation tool?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#creating-test-cases\",\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#building-evals-and-test-cases\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#creating-test-cases",
|
|
"https://docs.claude.com/en/docs/about-claude/use-cases/classification#2-develop-your-test-cases",
|
|
"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"latencyMs": 96901,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_two"
|
|
},
|
|
"prompt": {
|
|
"raw": "What embeddings provider does Anthropic recommend for customized domain-specific models, and what capabilities does this provider offer?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What embeddings provider does Anthropic recommend for customized domain-specific models, and what capabilities does this provider offer?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#before-implementing-embeddings\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic",
|
|
"https://docs.claude.com/en/docs/build-with-claude/embeddings#before-implementing-embeddings",
|
|
"https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"latencyMs": 1011,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_three"
|
|
},
|
|
"prompt": {
|
|
"raw": "What embeddings provider does Anthropic recommend for customized domain-specific models, and what capabilities does this provider offer?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What embeddings provider does Anthropic recommend for customized domain-specific models, and what capabilities does this provider offer?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#before-implementing-embeddings\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic",
|
|
"https://docs.claude.com/en/docs/build-with-claude/embeddings#before-implementing-embeddings",
|
|
"https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"latencyMs": 1,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_base"
|
|
},
|
|
"prompt": {
|
|
"raw": "What are some key success metrics to consider when evaluating Claude's performance on a classification task, and how do they relate to choosing the right model to reduce latency?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What are some key success metrics to consider when evaluating Claude's performance on a classification task, and how do they relate to choosing the right model to reduce latency?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics",
|
|
"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification",
|
|
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#choosing-the-right-model"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"latencyMs": 957,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_two"
|
|
},
|
|
"prompt": {
|
|
"raw": "What are some key success metrics to consider when evaluating Claude's performance on a classification task, and how do they relate to choosing the right model to reduce latency?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What are some key success metrics to consider when evaluating Claude's performance on a classification task, and how do they relate to choosing the right model to reduce latency?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics",
|
|
"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification",
|
|
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#choosing-the-right-model"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"latencyMs": 0,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_three"
|
|
},
|
|
"prompt": {
|
|
"raw": "What are some key success metrics to consider when evaluating Claude's performance on a classification task, and how do they relate to choosing the right model to reduce latency?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What are some key success metrics to consider when evaluating Claude's performance on a classification task, and how do they relate to choosing the right model to reduce latency?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics",
|
|
"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification",
|
|
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#choosing-the-right-model"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"latencyMs": 1,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_base"
|
|
},
|
|
"prompt": {
|
|
"raw": "What are two ways that Claude for Sheets can improve prompt engineering workflows compared to using chained prompts?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What are two ways that Claude for Sheets can improve prompt engineering workflows compared to using chained prompts?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#why-use-claude-for-sheets\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow",
|
|
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#why-use-claude-for-sheets",
|
|
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"latencyMs": 975,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "MRR is 0.5",
|
|
"named_scores": {
|
|
"MRR": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "MRR is 0.5",
|
|
"named_scores": {
|
|
"MRR": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_two"
|
|
},
|
|
"prompt": {
|
|
"raw": "What are two ways that Claude for Sheets can improve prompt engineering workflows compared to using chained prompts?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What are two ways that Claude for Sheets can improve prompt engineering workflows compared to using chained prompts?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#why-use-claude-for-sheets\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow",
|
|
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#why-use-claude-for-sheets",
|
|
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"latencyMs": 0,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "MRR is 0.5",
|
|
"named_scores": {
|
|
"MRR": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "MRR is 0.5",
|
|
"named_scores": {
|
|
"MRR": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_three"
|
|
},
|
|
"prompt": {
|
|
"raw": "What are two ways that Claude for Sheets can improve prompt engineering workflows compared to using chained prompts?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What are two ways that Claude for Sheets can improve prompt engineering workflows compared to using chained prompts?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#why-use-claude-for-sheets\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow",
|
|
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#why-use-claude-for-sheets",
|
|
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"latencyMs": 0,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "MRR is 0.5",
|
|
"named_scores": {
|
|
"MRR": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "MRR is 0.5",
|
|
"named_scores": {
|
|
"MRR": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_base"
|
|
},
|
|
"prompt": {
|
|
"raw": "What happens if a prompt for the Text Completions API is missing the \"\\n\\nHuman:\" and \"\\n\\nAssistant:\" turns?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What happens if a prompt for the Text Completions API is missing the \"\\n\\nHuman:\" and \"\\n\\nAssistant:\" turns?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt\",\"https://docs.claude.com/en/api/prompt-validation#examples\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/api/prompt-validation#examples",
|
|
"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt",
|
|
"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"latencyMs": 1007,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_two"
|
|
},
|
|
"prompt": {
|
|
"raw": "What happens if a prompt for the Text Completions API is missing the \"\\n\\nHuman:\" and \"\\n\\nAssistant:\" turns?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What happens if a prompt for the Text Completions API is missing the \"\\n\\nHuman:\" and \"\\n\\nAssistant:\" turns?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt\",\"https://docs.claude.com/en/api/prompt-validation#examples\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/api/prompt-validation#examples",
|
|
"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt",
|
|
"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"latencyMs": 1,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_three"
|
|
},
|
|
"prompt": {
|
|
"raw": "What happens if a prompt for the Text Completions API is missing the \"\\n\\nHuman:\" and \"\\n\\nAssistant:\" turns?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What happens if a prompt for the Text Completions API is missing the \"\\n\\nHuman:\" and \"\\n\\nAssistant:\" turns?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt\",\"https://docs.claude.com/en/api/prompt-validation#examples\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/api/prompt-validation#examples",
|
|
"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt",
|
|
"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"latencyMs": 0,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_base"
|
|
},
|
|
"prompt": {
|
|
"raw": "How do the additional tokens required for tool use in Claude API requests impact pricing compared to regular API requests?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "How do the additional tokens required for tool use in Claude API requests impact pricing compared to regular API requests?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#pricing\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#pricing",
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works",
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"latencyMs": 997,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_two"
|
|
},
|
|
"prompt": {
|
|
"raw": "How do the additional tokens required for tool use in Claude API requests impact pricing compared to regular API requests?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "How do the additional tokens required for tool use in Claude API requests impact pricing compared to regular API requests?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#pricing\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#pricing",
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works",
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"latencyMs": 1,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_three"
|
|
},
|
|
"prompt": {
|
|
"raw": "How do the additional tokens required for tool use in Claude API requests impact pricing compared to regular API requests?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "How do the additional tokens required for tool use in Claude API requests impact pricing compared to regular API requests?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#pricing\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#pricing",
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works",
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"latencyMs": 0,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_base"
|
|
},
|
|
"prompt": {
|
|
"raw": "When will the new Anthropic Developer Console features that show API usage, billing details, and rate limits be available?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "When will the new Anthropic Developer Console features that show API usage, billing details, and rate limits be available?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#june-27th-2024\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/release-notes/api#june-27th-2024",
|
|
"https://docs.claude.com/en/release-notes/api#may-30th-2024",
|
|
"https://docs.claude.com/en/api/rate-limits#about-our-limits"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"latencyMs": 1030,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_two"
|
|
},
|
|
"prompt": {
|
|
"raw": "When will the new Anthropic Developer Console features that show API usage, billing details, and rate limits be available?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "When will the new Anthropic Developer Console features that show API usage, billing details, and rate limits be available?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#june-27th-2024\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/release-notes/api#june-27th-2024",
|
|
"https://docs.claude.com/en/release-notes/api#may-30th-2024",
|
|
"https://docs.claude.com/en/api/rate-limits#about-our-limits"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"latencyMs": 0,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_three"
|
|
},
|
|
"prompt": {
|
|
"raw": "When will the new Anthropic Developer Console features that show API usage, billing details, and rate limits be available?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "When will the new Anthropic Developer Console features that show API usage, billing details, and rate limits be available?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#june-27th-2024\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/release-notes/api#june-27th-2024",
|
|
"https://docs.claude.com/en/release-notes/api#may-30th-2024",
|
|
"https://docs.claude.com/en/api/rate-limits#about-our-limits"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"latencyMs": 1,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
        "prompt": { "raw": "When deciding whether to use chain-of-thought (CoT) for a task, what are two key factors to consider in order to strike the right balance between performance and latency?", "label": "{{ query }}" },
        "vars": { "query": "When deciding whether to use chain-of-thought (CoT) for a task, what are two key factors to consider in order to strike the right balance between performance and latency?", "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#why-not-let-claude-think\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#before-implementing-cot\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#why-not-let-claude-think", "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-measure-latency", "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#before-implementing-cot"] },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 992,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
        "prompt": { "raw": "When deciding whether to use chain-of-thought (CoT) for a task, what are two key factors to consider in order to strike the right balance between performance and latency?", "label": "{{ query }}" },
        "vars": { "query": "When deciding whether to use chain-of-thought (CoT) for a task, what are two key factors to consider in order to strike the right balance between performance and latency?", "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#why-not-let-claude-think\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#before-implementing-cot\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#why-not-let-claude-think", "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-measure-latency", "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#before-implementing-cot"] },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 0,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
        "prompt": { "raw": "When deciding whether to use chain-of-thought (CoT) for a task, what are two key factors to consider in order to strike the right balance between performance and latency?", "label": "{{ query }}" },
        "vars": { "query": "When deciding whether to use chain-of-thought (CoT) for a task, what are two key factors to consider in order to strike the right balance between performance and latency?", "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#why-not-let-claude-think\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#before-implementing-cot\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#why-not-let-claude-think", "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-measure-latency", "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#before-implementing-cot"] },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
        "prompt": { "raw": "How can you create multiple test cases for an evaluation in the Anthropic Evaluation tool?", "label": "{{ query }}" },
        "vars": { "query": "How can you create multiple test cases for an evaluation in the Anthropic Evaluation tool?", "correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#creating-test-cases\",\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#building-evals-and-test-cases\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results", "https://docs.claude.com/en/docs/build-with-claude/develop-tests#building-evals-and-test-cases", "https://docs.claude.com/en/docs/about-claude/use-cases/classification#2-develop-your-test-cases"] },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 106010,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
              "componentResults": [
                { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
        "prompt": { "raw": "How can I use Claude to more easily digest the content of long PDF documents?", "label": "{{ query }}" },
        "vars": { "query": "How can I use Claude to more easily digest the content of long PDF documents?", "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook\",\"https://docs.claude.com/en/docs/build-with-claude/vision#before-you-upload\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/tool-use#controlling-claudes-output", "https://docs.claude.com/en/docs/build-with-claude/text-generation#text-capabilities-and-use-cases", "https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook"] },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1226,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
              "componentResults": [
                { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
        "prompt": { "raw": "How can I use Claude to more easily digest the content of long PDF documents?", "label": "{{ query }}" },
        "vars": { "query": "How can I use Claude to more easily digest the content of long PDF documents?", "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook\",\"https://docs.claude.com/en/docs/build-with-claude/vision#before-you-upload\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/tool-use#controlling-claudes-output", "https://docs.claude.com/en/docs/build-with-claude/text-generation#text-capabilities-and-use-cases", "https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook"] },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
              "componentResults": [
                { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
        "prompt": { "raw": "How can I use Claude to more easily digest the content of long PDF documents?", "label": "{{ query }}" },
        "vars": { "query": "How can I use Claude to more easily digest the content of long PDF documents?", "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook\",\"https://docs.claude.com/en/docs/build-with-claude/vision#before-you-upload\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/text-generation#text-capabilities-and-use-cases", "https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook", "https://docs.claude.com/en/docs/intro-to-claude#what-you-can-do-with-claude"] },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 928,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
              "componentResults": [
                { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
        "prompt": { "raw": "How can you create multiple test cases for an evaluation in the Anthropic Evaluation tool?", "label": "{{ query }}" },
        "vars": { "query": "How can you create multiple test cases for an evaluation in the Anthropic Evaluation tool?", "correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#creating-test-cases\",\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#building-evals-and-test-cases\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#creating-test-cases", "https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#accessing-the-evaluate-feature", "https://docs.claude.com/en/docs/about-claude/use-cases/classification#2-develop-your-test-cases"] },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 107527,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
        "prompt": { "raw": "According to the documentation, where can you view your organization's current API rate limits in the Claude Console?", "label": "{{ query }}" },
        "vars": { "query": "According to the documentation, where can you view your organization's current API rate limits in the Claude Console?", "correct_chunks": "[\"https://docs.claude.com/en/api/rate-limits#about-our-limits\",\"https://docs.claude.com/en/release-notes/api#june-27th-2024\"]" },
        "response": { "output": ["https://docs.claude.com/en/api/rate-limits#about-our-limits", "https://docs.claude.com/en/api/rate-limits#response-headers", "https://docs.claude.com/en/release-notes/api#june-27th-2024"] },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1543,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
        "prompt": { "raw": "According to the documentation, where can you view your organization's current API rate limits in the Claude Console?", "label": "{{ query }}" },
        "vars": { "query": "According to the documentation, where can you view your organization's current API rate limits in the Claude Console?", "correct_chunks": "[\"https://docs.claude.com/en/api/rate-limits#about-our-limits\",\"https://docs.claude.com/en/release-notes/api#june-27th-2024\"]" },
        "response": { "output": ["https://docs.claude.com/en/api/rate-limits#response-headers", "https://docs.claude.com/en/api/rate-limits#about-our-limits", "https://docs.claude.com/en/release-notes/api#june-27th-2024"] },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1520,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 0.5",
              "componentResults": [
                { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
        "prompt": { "raw": "How can we measure the performance of the ticket classification system implemented using Claude beyond just accuracy?", "label": "{{ query }}" },
        "vars": { "query": "How can we measure the performance of the ticket classification system implemented using Claude beyond just accuracy?", "correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#prompting-claude-for-ticket-routing\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier", "https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#advantages-of-using-claude", "https://docs.claude.com/en/docs/about-claude/use-cases/classification#deploy-your-classifier"] },
        "success": false,
        "score": 0,
        "namedScores": {},
        "latencyMs": 1105,
        "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "gradingResult": {
          "pass": false,
          "score": 0,
          "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": false,
              "score": 0,
              "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
              "componentResults": [
                { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
                { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
                { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
                { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
        "prompt": { "raw": "How can we measure the performance of the ticket classification system implemented using Claude beyond just accuracy?", "label": "{{ query }}" },
        "vars": { "query": "How can we measure the performance of the ticket classification system implemented using Claude beyond just accuracy?", "correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#prompting-claude-for-ticket-routing\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier", "https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#advantages-of-using-claude", "https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics"] },
        "success": false,
        "score": 0,
        "namedScores": {},
        "latencyMs": 1227,
        "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "gradingResult": {
          "pass": false,
          "score": 0,
          "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": false,
              "score": 0,
              "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
              "componentResults": [
                { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
                { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
                { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
                { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
        "prompt": { "raw": "How can we measure the performance of the ticket classification system implemented using Claude beyond just accuracy?", "label": "{{ query }}" },
        "vars": { "query": "How can we measure the performance of the ticket classification system implemented using Claude beyond just accuracy?", "correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#prompting-claude-for-ticket-routing\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier", "https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#advantages-of-using-claude", "https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics"] },
        "success": false,
        "score": 0,
        "namedScores": {},
        "latencyMs": 0,
        "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "gradingResult": {
          "pass": false,
          "score": 0,
          "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": false,
              "score": 0,
              "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
              "componentResults": [
                { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
                { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
                { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
                { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
        "prompt": { "raw": "According to the documentation, where can you view your organization's current API rate limits in the Claude Console?", "label": "{{ query }}" },
        "vars": { "query": "According to the documentation, where can you view your organization's current API rate limits in the Claude Console?", "correct_chunks": "[\"https://docs.claude.com/en/api/rate-limits#about-our-limits\",\"https://docs.claude.com/en/release-notes/api#june-27th-2024\"]" },
        "response": { "output": ["https://docs.claude.com/en/release-notes/api#june-27th-2024", "https://docs.claude.com/en/api/rate-limits#about-our-limits", "https://docs.claude.com/en/api/rate-limits#rate-limits"] },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 2784,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
        "prompt": { "raw": "What embeddings provider does Anthropic recommend for customized domain-specific models, and what capabilities does this provider offer?", "label": "{{ query }}" },
        "vars": { "query": "What embeddings provider does Anthropic recommend for customized domain-specific models, and what capabilities does this provider offer?", "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#before-implementing-embeddings\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic", "https://docs.claude.com/en/docs/build-with-claude/embeddings#before-implementing-embeddings", "https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models"] },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 110531,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
        "prompt": { "raw": "How can you specify a system prompt using the Text Completions API versus the Messages API?", "label": "{{ query }}" },
        "vars": { "query": "How can you specify a system prompt using the Text Completions API versus the Messages API?", "correct_chunks": "[\"https://docs.claude.com/en/api/prompt-validation#examples\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt\"]" },
        "response": { "output": ["https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt", "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format", "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs"] },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1816,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
        "prompt": { "raw": "How can you combine XML tags with chain of thought reasoning to create high-performance prompts for Claude?", "label": "{{ query }}" },
        "vars": { "query": "How can you combine XML tags with chain of thought reasoning to create high-performance prompts for Claude?", "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#tagging-best-practices\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#chain-of-thought\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts", "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#why-use-xml-tags", "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer"] },
        "success": false,
        "score": 0,
        "namedScores": {},
        "latencyMs": 1235,
        "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "gradingResult": {
          "pass": false,
          "score": 0,
          "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": false,
              "score": 0,
              "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
              "componentResults": [
                { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
                { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
                { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
                { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
        "prompt": { "raw": "How can you specify a system prompt using the Text Completions API versus the Messages API?", "label": "{{ query }}" },
        "vars": { "query": "How can you specify a system prompt using the Text Completions API versus the Messages API?", "correct_chunks": "[\"https://docs.claude.com/en/api/prompt-validation#examples\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt\"]" },
        "response": { "output": ["https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt", "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format", "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs"] },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1828,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
        "prompt": { "raw": "How can you combine XML tags with chain of thought reasoning to create high-performance prompts for Claude?", "label": "{{ query }}" },
        "vars": { "query": "How can you combine XML tags with chain of thought reasoning to create high-performance prompts for Claude?", "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#tagging-best-practices\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#chain-of-thought\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts", "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#why-use-xml-tags", "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer"] },
        "success": false,
        "score": 0,
        "namedScores": {},
        "latencyMs": 1,
        "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "gradingResult": {
          "pass": false,
          "score": 0,
          "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": false,
              "score": 0,
              "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
              "componentResults": [
                { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
                { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
                { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
                { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
        "prompt": { "raw": "How can you combine XML tags with chain of thought reasoning to create high-performance prompts for Claude?", "label": "{{ query }}" },
        "vars": { "query": "How can you combine XML tags with chain of thought reasoning to create high-performance prompts for Claude?", "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#tagging-best-practices\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#chain-of-thought\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts", "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#why-use-xml-tags", "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer"] },
        "success": false,
        "score": 0,
        "namedScores": {},
        "latencyMs": 1,
        "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "gradingResult": {
          "pass": false,
          "score": 0,
          "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": false,
              "score": 0,
              "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
              "componentResults": [
                { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
                { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
                { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
                { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
  "prompt": { "raw": "When evaluating the Claude model's performance for ticket routing, what three key metrics are calculated and what are the results for the claude-3-haiku-20240307 model on the 91 test samples?", "label": "{{ query }}" },
  "vars": {
    "query": "When evaluating the Claude model's performance for ticket routing, what three key metrics are calculated and what are the results for the claude-3-haiku-20240307 model on the 91 test samples?",
    "correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#example-data\"]"
  },
  "response": {
    "output": [
      "https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier",
      "https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#choosing-the-right-model",
      "https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification"
    ]
  },
  "success": false,
  "score": 0,
  "namedScores": {},
  "latencyMs": 1304,
  "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
  "gradingResult": {
    "pass": false,
    "score": 0,
    "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "componentResults": [
          { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
          { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
          { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
          { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
  "prompt": { "raw": "When evaluating the Claude model's performance for ticket routing, what three key metrics are calculated and what are the results for the claude-3-haiku-20240307 model on the 91 test samples?", "label": "{{ query }}" },
  "vars": {
    "query": "When evaluating the Claude model's performance for ticket routing, what three key metrics are calculated and what are the results for the claude-3-haiku-20240307 model on the 91 test samples?",
    "correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#example-data\"]"
  },
  "response": {
    "output": [
      "https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier",
      "https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations",
      "https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#choosing-the-right-model"
    ]
  },
  "success": false,
  "score": 0,
  "namedScores": {},
  "latencyMs": 1287,
  "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
  "gradingResult": {
    "pass": false,
    "score": 0,
    "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "componentResults": [
          { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
          { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
          { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
          { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
  "prompt": { "raw": "How can you specify a system prompt using the Text Completions API versus the Messages API?", "label": "{{ query }}" },
  "vars": {
    "query": "How can you specify a system prompt using the Text Completions API versus the Messages API?",
    "correct_chunks": "[\"https://docs.claude.com/en/api/prompt-validation#examples\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt\"]"
  },
  "response": {
    "output": [
      "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt",
      "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs",
      "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format"
    ]
  },
  "success": true,
  "score": 0.4,
  "namedScores": {},
  "latencyMs": 3415,
  "gradingResult": {
    "pass": true,
    "score": 0.4,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
        "componentResults": [
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
  "prompt": { "raw": "Before starting to engineer and improve a prompt in Claude, what key things does Anthropic recommend you have in place first?", "label": "{{ query }}" },
  "vars": {
    "query": "Before starting to engineer and improve a prompt in Claude, what key things does Anthropic recommend you have in place first?",
    "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/define-success#next-steps\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#before-prompt-engineering\"]"
  },
  "response": {
    "output": [
      "https://docs.claude.com/en/docs/intro-to-claude#implementing-claude",
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer",
      "https://docs.claude.com/en/docs/intro-to-claude#start-building-with-claude"
    ]
  },
  "success": false,
  "score": 0,
  "namedScores": {},
  "latencyMs": 1196,
  "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
  "gradingResult": {
    "pass": false,
    "score": 0,
    "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "componentResults": [
          { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
          { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
          { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
          { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
  "prompt": { "raw": "Before starting to engineer and improve a prompt in Claude, what key things does Anthropic recommend you have in place first?", "label": "{{ query }}" },
  "vars": {
    "query": "Before starting to engineer and improve a prompt in Claude, what key things does Anthropic recommend you have in place first?",
    "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/define-success#next-steps\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#before-prompt-engineering\"]"
  },
  "response": {
    "output": [
      "https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow",
      "https://docs.claude.com/en/docs/build-with-claude/vision#prompt-examples",
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#before-prompt-engineering"
    ]
  },
  "success": true,
  "score": 0.4,
  "namedScores": {},
  "latencyMs": 1234,
  "gradingResult": {
    "pass": true,
    "score": 0.4,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
        "componentResults": [
          { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
  "prompt": { "raw": "How does the Messages API handle mid-response prompting compared to the Text Completions API?", "label": "{{ query }}" },
  "vars": {
    "query": "How does the Messages API handle mid-response prompting compared to the Text Completions API?",
    "correct_chunks": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth\"]"
  },
  "response": {
    "output": [
      "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format",
      "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs",
      "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth"
    ]
  },
  "success": true,
  "score": 0.8,
  "namedScores": {},
  "latencyMs": 1181,
  "gradingResult": {
    "pass": true,
    "score": 0.8,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 0.5",
        "componentResults": [
          { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
  "prompt": { "raw": "How does the Messages API handle mid-response prompting compared to the Text Completions API?", "label": "{{ query }}" },
  "vars": {
    "query": "How does the Messages API handle mid-response prompting compared to the Text Completions API?",
    "correct_chunks": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth\"]"
  },
  "response": {
    "output": [
      "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth",
      "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs",
      "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format"
    ]
  },
  "success": true,
  "score": 0.8,
  "namedScores": {},
  "latencyMs": 1196,
  "gradingResult": {
    "pass": true,
    "score": 0.8,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
        "componentResults": [
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
  "prompt": { "raw": "How does the Messages API handle mid-response prompting compared to the Text Completions API?", "label": "{{ query }}" },
  "vars": {
    "query": "How does the Messages API handle mid-response prompting compared to the Text Completions API?",
    "correct_chunks": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth\"]"
  },
  "response": {
    "output": [
      "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format",
      "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs",
      "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth"
    ]
  },
  "success": true,
  "score": 0.8,
  "namedScores": {},
  "latencyMs": 1,
  "gradingResult": {
    "pass": true,
    "score": 0.8,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 0.5",
        "componentResults": [
          { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
  "prompt": { "raw": "When evaluating the Claude model's performance for ticket routing, what three key metrics are calculated and what are the results for the claude-3-haiku-20240307 model on the 91 test samples?", "label": "{{ query }}" },
  "vars": {
    "query": "When evaluating the Claude model's performance for ticket routing, what three key metrics are calculated and what are the results for the claude-3-haiku-20240307 model on the 91 test samples?",
    "correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#example-data\"]"
  },
  "response": {
    "output": [
      "https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics",
      "https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology",
      "https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#choosing-the-right-model"
    ]
  },
  "success": true,
  "score": 0.4,
  "namedScores": {},
  "latencyMs": 4007,
  "gradingResult": {
    "pass": true,
    "score": 0.4,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
        "componentResults": [
          { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
  "prompt": { "raw": "How does Claude's response differ when given a role through a system prompt compared to not having a specific role in the financial analysis example?", "label": "{{ query }}" },
  "vars": {
    "query": "How does Claude's response differ when given a role through a system prompt compared to not having a specific role in the financial analysis example?",
    "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-2-financial-analysis\"]"
  },
  "response": {
    "output": [
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#why-use-role-prompting",
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-2-financial-analysis",
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer"
    ]
  },
  "success": true,
  "score": 0.5,
  "namedScores": {},
  "latencyMs": 1442,
  "gradingResult": {
    "pass": true,
    "score": 0.5,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.5, "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.5",
        "componentResults": [
          { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
          { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
  "prompt": { "raw": "Before starting to engineer and improve a prompt in Claude, what key things does Anthropic recommend you have in place first?", "label": "{{ query }}" },
  "vars": {
    "query": "Before starting to engineer and improve a prompt in Claude, what key things does Anthropic recommend you have in place first?",
    "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/define-success#next-steps\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#before-prompt-engineering\"]"
  },
  "response": {
    "output": [
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#before-prompt-engineering",
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer",
      "https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#iterating-your-prompt-for-better-performance"
    ]
  },
  "success": true,
  "score": 0.4,
  "namedScores": {},
  "latencyMs": 3360,
  "gradingResult": {
    "pass": true,
    "score": 0.4,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
        "componentResults": [
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
  "prompt": { "raw": "How does Claude's response differ when given a role through a system prompt compared to not having a specific role in the financial analysis example?", "label": "{{ query }}" },
  "vars": {
    "query": "How does Claude's response differ when given a role through a system prompt compared to not having a specific role in the financial analysis example?",
    "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-2-financial-analysis\"]"
  },
  "response": {
    "output": [
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#why-use-role-prompting",
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-2-financial-analysis",
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer"
    ]
  },
  "success": true,
  "score": 0.5,
  "namedScores": {},
  "latencyMs": 1452,
  "gradingResult": {
    "pass": true,
    "score": 0.5,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.5, "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.5",
        "componentResults": [
          { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
          { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
  "prompt": { "raw": "What are some quantitative metrics that can be used to measure the success of a sentiment analysis model, and how might specific targets for those metrics be determined?", "label": "{{ query }}" },
  "vars": {
    "query": "What are some quantitative metrics that can be used to measure the success of a sentiment analysis model, and how might specific targets for those metrics be determined?",
    "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/define-success#building-strong-criteria\"]"
  },
  "response": {
    "output": [
      "https://docs.claude.com/en/docs/build-with-claude/define-success#common-success-criteria-to-consider",
      "https://docs.claude.com/en/docs/build-with-claude/define-success#building-strong-criteria",
      "https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics"
    ]
  },
  "success": true,
  "score": 0.5,
  "namedScores": {},
  "latencyMs": 1256,
  "gradingResult": {
    "pass": true,
    "score": 0.5,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.5, "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.5",
        "componentResults": [
          { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
          { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
  "prompt": { "raw": "What is a power user tip mentioned in the documentation for creating high-performance prompts using XML tags?", "label": "{{ query }}" },
  "vars": {
    "query": "What is a power user tip mentioned in the documentation for creating high-performance prompts using XML tags?",
    "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#tagging-best-practices\"]"
  },
  "response": {
    "output": [
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#tagging-best-practices",
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#why-use-xml-tags",
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer"
    ]
  },
  "success": true,
  "score": 0.8,
  "namedScores": {},
  "latencyMs": 1011,
  "gradingResult": {
    "pass": true,
    "score": 0.8,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
        "componentResults": [
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
  "prompt": { "raw": "What is a power user tip mentioned in the documentation for creating high-performance prompts using XML tags?", "label": "{{ query }}" },
  "vars": {
    "query": "What is a power user tip mentioned in the documentation for creating high-performance prompts using XML tags?",
    "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#tagging-best-practices\"]"
  },
  "response": {
    "output": [
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#tagging-best-practices",
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#why-use-xml-tags",
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer"
    ]
  },
  "success": true,
  "score": 0.8,
  "namedScores": {},
  "latencyMs": 0,
  "gradingResult": {
    "pass": true,
    "score": 0.8,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
        "componentResults": [
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
  "prompt": { "raw": "What is a power user tip mentioned in the documentation for creating high-performance prompts using XML tags?", "label": "{{ query }}" },
  "vars": {
    "query": "What is a power user tip mentioned in the documentation for creating high-performance prompts using XML tags?",
    "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#tagging-best-practices\"]"
  },
  "response": {
    "output": [
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#tagging-best-practices",
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#why-use-xml-tags",
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer"
    ]
  },
  "success": true,
  "score": 0.8,
  "namedScores": {},
  "latencyMs": 1,
  "gradingResult": {
    "pass": true,
    "score": 0.8,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
        "componentResults": [
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
  "prompt": { "raw": "How does Claude's response differ when given a role through a system prompt compared to not having a specific role in the financial analysis example?", "label": "{{ query }}" },
  "vars": {
    "query": "How does Claude's response differ when given a role through a system prompt compared to not having a specific role in the financial analysis example?",
    "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-2-financial-analysis\"]"
  },
  "response": {
    "output": [
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-2-financial-analysis",
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#how-to-give-claude-a-role",
      "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#why-use-role-prompting"
    ]
  },
  "success": true,
  "score": 0.5,
  "namedScores": {},
  "latencyMs": 3789,
  "gradingResult": {
    "pass": true,
    "score": 0.5,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.5, "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
        "componentResults": [
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
          { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
  "prompt": { "raw": "How can you use an LLM like Claude to automatically grade the outputs of other LLMs based on a rubric?", "label": "{{ query }}" },
  "vars": {
    "query": "How can you use an LLM like Claude to automatically grade the outputs of other LLMs based on a rubric?",
    "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#tips-for-llm-based-grading\",\"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns\"]"
  },
  "response": {
    "output": [
      "https://docs.claude.com/en/docs/build-with-claude/develop-tests#tips-for-llm-based-grading",
      "https://docs.claude.com/en/docs/build-with-claude/develop-tests#grading-evals",
      "https://docs.claude.com/en/docs/about-claude/use-cases/classification#when-to-use-claude-for-classification"
    ]
  },
  "success": true,
  "score": 0.4,
  "namedScores": {},
  "latencyMs": 1237,
  "gradingResult": {
    "pass": true,
    "score": 0.4,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
        "componentResults": [
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
  "prompt": { "raw": "How can you use an LLM like Claude to automatically grade the outputs of other LLMs based on a rubric?", "label": "{{ query }}" },
  "vars": {
    "query": "How can you use an LLM like Claude to automatically grade the outputs of other LLMs based on a rubric?",
    "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#tips-for-llm-based-grading\",\"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns\"]"
  },
  "response": {
    "output": [
      "https://docs.claude.com/en/docs/about-claude/use-cases/classification#when-to-use-claude-for-classification",
      "https://docs.claude.com/en/docs/build-with-claude/develop-tests#grading-evals",
      "https://docs.claude.com/en/docs/build-with-claude/develop-tests#tips-for-llm-based-grading"
    ]
  },
  "success": true,
  "score": 0.4,
  "namedScores": {},
  "latencyMs": 1277,
  "gradingResult": {
    "pass": true,
    "score": 0.4,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
        "componentResults": [
          { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_three"
|
|
},
|
|
"prompt": {
|
|
"raw": "How can you use an LLM like Claude to automatically grade the outputs of other LLMs based on a rubric?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "How can you use an LLM like Claude to automatically grade the outputs of other LLMs based on a rubric?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#tips-for-llm-based-grading\",\"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/about-claude/use-cases/classification#when-to-use-claude-for-classification",
|
|
"https://docs.claude.com/en/docs/build-with-claude/develop-tests#grading-evals",
|
|
"https://docs.claude.com/en/docs/build-with-claude/develop-tests#tips-for-llm-based-grading"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"latencyMs": 1,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "MRR is 0.3333333333333333",
|
|
"named_scores": {
|
|
"MRR": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "MRR is 0.3333333333333333",
|
|
"named_scores": {
|
|
"MRR": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_base"
|
|
},
|
|
"prompt": {
|
|
"raw": "How can you access and deploy Voyage embeddings on AWS Marketplace?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "How can you access and deploy Voyage embeddings on AWS Marketplace?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-on-the-aws-marketplace\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-on-the-aws-marketplace",
|
|
"https://docs.claude.com/en/docs/build-with-claude/embeddings#getting-started-with-voyage-ai",
|
|
"https://docs.claude.com/en/docs/build-with-claude/embeddings#pricing"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"latencyMs": 1145,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_two"
|
|
},
|
|
"prompt": {
|
|
"raw": "How can you access and deploy Voyage embeddings on AWS Marketplace?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "How can you access and deploy Voyage embeddings on AWS Marketplace?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-on-the-aws-marketplace\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-on-the-aws-marketplace",
|
|
"https://docs.claude.com/en/docs/build-with-claude/embeddings#getting-started-with-voyage-ai",
|
|
"https://docs.claude.com/en/docs/build-with-claude/embeddings#pricing"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"latencyMs": 1087,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_three"
|
|
},
|
|
"prompt": {
|
|
"raw": "How can you access and deploy Voyage embeddings on AWS Marketplace?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "How can you access and deploy Voyage embeddings on AWS Marketplace?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-on-the-aws-marketplace\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-on-the-aws-marketplace",
|
|
"https://docs.claude.com/en/docs/build-with-claude/embeddings#getting-started-with-voyage-ai",
|
|
"https://docs.claude.com/en/docs/build-with-claude/embeddings#pricing"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"latencyMs": 0,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_two"
|
|
},
|
|
"prompt": {
|
|
"raw": "What are some quantitative metrics that can be used to measure the success of a sentiment analysis model, and how might specific targets for those metrics be determined?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What are some quantitative metrics that can be used to measure the success of a sentiment analysis model, and how might specific targets for those metrics be determined?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/define-success#building-strong-criteria\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/define-success#common-success-criteria-to-consider",
|
|
"https://docs.claude.com/en/docs/build-with-claude/define-success#building-strong-criteria",
|
|
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"latencyMs": 5842,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.5",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "MRR is 0.5",
|
|
"named_scores": {
|
|
"MRR": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "MRR is 0.5",
|
|
"named_scores": {
|
|
"MRR": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_base"
|
|
},
|
|
"prompt": {
|
|
"raw": "When using tools just to get Claude to produce JSON output following a particular schema, what key things should you do in terms of tool setup and prompting?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "When using tools just to get Claude to produce JSON output following a particular schema, what key things should you do in terms of tool setup and prompting?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#tool-use-examples\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#json-output\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#controlling-claudes-output",
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#next-steps",
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#json-output"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"latencyMs": 1468,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "MRR is 0.3333333333333333",
|
|
"named_scores": {
|
|
"MRR": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "MRR is 0.3333333333333333",
|
|
"named_scores": {
|
|
"MRR": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_two"
|
|
},
|
|
"prompt": {
|
|
"raw": "When using tools just to get Claude to produce JSON output following a particular schema, what key things should you do in terms of tool setup and prompting?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "When using tools just to get Claude to produce JSON output following a particular schema, what key things should you do in terms of tool setup and prompting?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#tool-use-examples\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#json-output\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works",
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#json-output",
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#next-steps"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"latencyMs": 1596,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "MRR is 0.5",
|
|
"named_scores": {
|
|
"MRR": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "MRR is 0.5",
|
|
"named_scores": {
|
|
"MRR": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_base"
|
|
},
|
|
"prompt": {
|
|
"raw": "What are the key differences between the legacy Claude Instant 1.2 model and the Claude 3 Haiku model in terms of capabilities and performance?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What are the key differences between the legacy Claude Instant 1.2 model and the Claude 3 Haiku model in terms of capabilities and performance?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison\",\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\",\"https://docs.claude.com/en/docs/about-claude/models#legacy-models\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/about-claude/models#legacy-models",
|
|
"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison",
|
|
"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.6666666666666666,
|
|
"namedScores": {},
|
|
"latencyMs": 1431,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 0.6666666666666666 \n F1 Score: 0.6666666666666666 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Recall is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Recall": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "F1 is 0.6666666666666666",
|
|
"named_scores": {
|
|
"F1": 0.6666666666666666
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Recall is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Recall": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "F1 is 0.6666666666666666",
|
|
"named_scores": {
|
|
"F1": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_three"
|
|
},
|
|
"prompt": {
|
|
"raw": "What are some quantitative metrics that can be used to measure the success of a sentiment analysis model, and how might specific targets for those metrics be determined?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What are some quantitative metrics that can be used to measure the success of a sentiment analysis model, and how might specific targets for those metrics be determined?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/define-success#building-strong-criteria\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/define-success#building-strong-criteria",
|
|
"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics",
|
|
"https://docs.claude.com/en/docs/build-with-claude/define-success#common-success-criteria-to-consider"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"latencyMs": 8264,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_two"
|
|
},
|
|
"prompt": {
|
|
"raw": "What are the key differences between the legacy Claude Instant 1.2 model and the Claude 3 Haiku model in terms of capabilities and performance?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What are the key differences between the legacy Claude Instant 1.2 model and the Claude 3 Haiku model in terms of capabilities and performance?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison\",\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\",\"https://docs.claude.com/en/docs/about-claude/models#legacy-models\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/about-claude/models#legacy-models",
|
|
"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison",
|
|
"https://docs.claude.com/en/docs/about-claude/models#model-comparison"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 1,
|
|
"namedScores": {},
|
|
"latencyMs": 1263,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Precision is 1.0",
|
|
"named_scores": {
|
|
"Precision": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "F1 is 1.0",
|
|
"named_scores": {
|
|
"F1": 1
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Precision is 1.0",
|
|
"named_scores": {
|
|
"Precision": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "F1 is 1.0",
|
|
"named_scores": {
|
|
"F1": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_three"
|
|
},
|
|
"prompt": {
|
|
"raw": "What are the key differences between the legacy Claude Instant 1.2 model and the Claude 3 Haiku model in terms of capabilities and performance?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What are the key differences between the legacy Claude Instant 1.2 model and the Claude 3 Haiku model in terms of capabilities and performance?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison\",\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\",\"https://docs.claude.com/en/docs/about-claude/models#legacy-models\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/about-claude/models#legacy-models",
|
|
"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison",
|
|
"https://docs.claude.com/en/docs/about-claude/models#model-comparison"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 1,
|
|
"namedScores": {},
|
|
"latencyMs": 1,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Precision is 1.0",
|
|
"named_scores": {
|
|
"Precision": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "F1 is 1.0",
|
|
"named_scores": {
|
|
"F1": 1
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Precision is 1.0",
|
|
"named_scores": {
|
|
"Precision": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "F1 is 1.0",
|
|
"named_scores": {
|
|
"F1": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_base"
|
|
},
|
|
"prompt": {
|
|
"raw": "What is one key benefit of using examples when prompt engineering with Claude?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What is one key benefit of using examples when prompt engineering with Claude?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow",
|
|
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples",
|
|
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"latencyMs": 1185,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.5",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "MRR is 0.5",
|
|
"named_scores": {
|
|
"MRR": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "MRR is 0.5",
|
|
"named_scores": {
|
|
"MRR": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_three"
|
|
},
|
|
"prompt": {
|
|
"raw": "When using tools just to get Claude to produce JSON output following a particular schema, what key things should you do in terms of tool setup and prompting?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "When using tools just to get Claude to produce JSON output following a particular schema, what key things should you do in terms of tool setup and prompting?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#tool-use-examples\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#json-output\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#json-output",
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works",
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#chain-of-thought"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"latencyMs": 3757,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_base"
|
|
},
|
|
"prompt": {
|
|
"raw": "According to the Claude Documentation, what is one key advantage of using prompt engineering instead of fine-tuning when it comes to adapting an AI model to new domains or tasks?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "According to the Claude Documentation, what is one key advantage of using prompt engineering instead of fine-tuning when it comes to adapting an AI model to new domains or tasks?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer",
|
|
"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak",
|
|
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#before-prompt-engineering"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"latencyMs": 1107,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_two"
|
|
},
|
|
"prompt": {
|
|
"raw": "According to the Claude Documentation, what is one key advantage of using prompt engineering instead of fine-tuning when it comes to adapting an AI model to new domains or tasks?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "According to the Claude Documentation, what is one key advantage of using prompt engineering instead of fine-tuning when it comes to adapting an AI model to new domains or tasks?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer",
|
|
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial",
|
|
"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"latencyMs": 1083,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_three"
|
|
},
|
|
"prompt": {
|
|
"raw": "According to the Claude Documentation, what is one key advantage of using prompt engineering instead of fine-tuning when it comes to adapting an AI model to new domains or tasks?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "According to the Claude Documentation, what is one key advantage of using prompt engineering instead of fine-tuning when it comes to adapting an AI model to new domains or tasks?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer",
|
|
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial",
|
|
"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"latencyMs": 1,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_three"
|
|
},
|
|
"prompt": {
|
|
"raw": "What is one key benefit of using examples when prompt engineering with Claude?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What is one key benefit of using examples when prompt engineering with Claude?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples",
|
|
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer",
|
|
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#crafting-effective-examples"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"latencyMs": 3331,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_two"
|
|
},
|
|
"prompt": {
|
|
"raw": "How can I quickly get started using the Claude for Sheets extension with a pre-made template?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "How can I quickly get started using the Claude for Sheets extension with a pre-made template?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#claude-for-sheets-workbook-template\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#get-started-with-claude-for-sheets\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#claude-for-sheets-workbook-template",
|
|
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#get-started-with-claude-for-sheets",
|
|
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#claude-for-sheets-usage-examples"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"latencyMs": 1310,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
        "prompt": {
          "raw": "How can I quickly get started using the Claude for Sheets extension with a pre-made template?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How can I quickly get started using the Claude for Sheets extension with a pre-made template?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#claude-for-sheets-workbook-template\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#get-started-with-claude-for-sheets\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#get-started-with-claude-for-sheets",
            "https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#claude-for-sheets-usage-examples",
            "https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#claude-for-sheets-workbook-template"
          ]
        },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1334,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
        "prompt": {
          "raw": "How does the \"index\" field in the \"content_block_delta\" event relate to the text being streamed in a response?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How does the \"index\" field in the \"content_block_delta\" event relate to the text being streamed in a response?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#basic-streaming-request\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/messages-streaming#delta-types",
            "https://docs.claude.com/en/api/messages-streaming#text-delta",
            "https://docs.claude.com/en/api/messages-streaming#event-types"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1252,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
              "componentResults": [
                { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
        "prompt": {
          "raw": "How does the \"index\" field in the \"content_block_delta\" event relate to the text being streamed in a response?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How does the \"index\" field in the \"content_block_delta\" event relate to the text being streamed in a response?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#basic-streaming-request\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/messages-streaming#delta-types",
            "https://docs.claude.com/en/api/messages-streaming#text-delta",
            "https://docs.claude.com/en/api/messages-streaming#event-types"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1264,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
              "componentResults": [
                { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
        "prompt": {
          "raw": "How does the \"index\" field in the \"content_block_delta\" event relate to the text being streamed in a response?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How does the \"index\" field in the \"content_block_delta\" event relate to the text being streamed in a response?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#basic-streaming-request\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/messages-streaming#delta-types",
            "https://docs.claude.com/en/api/messages-streaming#text-delta",
            "https://docs.claude.com/en/api/messages-streaming#event-types"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
              "componentResults": [
                { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
        "prompt": {
          "raw": "What is one key benefit of using examples when prompt engineering with Claude?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What is one key benefit of using examples when prompt engineering with Claude?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples",
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer",
            "https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow"
          ]
        },
        "success": true,
        "score": 0.5,
        "namedScores": {},
        "latencyMs": 5750,
        "gradingResult": {
          "pass": true,
          "score": 0.5,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.5,
              "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
        "prompt": {
          "raw": "How can you include an image as part of a Claude API request, and what image formats are currently supported?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How can you include an image as part of a Claude API request, and what image formats are currently supported?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#vision\",\"https://docs.claude.com/en/docs/build-with-claude/vision#about-the-prompt-examples\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/vision#ensuring-image-quality",
            "https://docs.claude.com/en/docs/build-with-claude/vision#faq",
            "https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size"
          ]
        },
        "success": false,
        "score": 0,
        "namedScores": {},
        "latencyMs": 1472,
        "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "gradingResult": {
          "pass": false,
          "score": 0,
          "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": false,
              "score": 0,
              "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
              "componentResults": [
                { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
                { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
                { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
                { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
        "prompt": {
          "raw": "How can you include an image as part of a Claude API request, and what image formats are currently supported?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How can you include an image as part of a Claude API request, and what image formats are currently supported?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#vision\",\"https://docs.claude.com/en/docs/build-with-claude/vision#about-the-prompt-examples\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/vision#ensuring-image-quality",
            "https://docs.claude.com/en/docs/build-with-claude/vision#faq",
            "https://docs.claude.com/en/api/messages-examples#vision"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1595,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
              "componentResults": [
                { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
        "prompt": {
          "raw": "How can I quickly get started using the Claude for Sheets extension with a pre-made template?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How can I quickly get started using the Claude for Sheets extension with a pre-made template?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#claude-for-sheets-workbook-template\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#get-started-with-claude-for-sheets\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#claude-for-sheets-workbook-template",
            "https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#get-started-with-claude-for-sheets",
            "https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial"
          ]
        },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 3557,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
        "prompt": {
          "raw": "What is the relationship between time to first token (TTFT) and latency when evaluating a language model's performance?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What is the relationship between time to first token (TTFT) and latency when evaluating a language model's performance?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#ttft-time-to-first-token\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-measure-latency\",\"https://docs.claude.com/en/docs/resources/glossary#latency\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/resources/glossary#ttft-time-to-first-token",
            "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-measure-latency",
            "https://docs.claude.com/en/docs/resources/glossary#latency"
          ]
        },
        "success": true,
        "score": 1,
        "namedScores": {},
        "latencyMs": 1394,
        "gradingResult": {
          "pass": true,
          "score": 1,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 1,
              "reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
        "prompt": {
          "raw": "What is the relationship between time to first token (TTFT) and latency when evaluating a language model's performance?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What is the relationship between time to first token (TTFT) and latency when evaluating a language model's performance?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#ttft-time-to-first-token\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-measure-latency\",\"https://docs.claude.com/en/docs/resources/glossary#latency\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/resources/glossary#ttft-time-to-first-token",
            "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-measure-latency",
            "https://docs.claude.com/en/docs/resources/glossary#latency"
          ]
        },
        "success": true,
        "score": 1,
        "namedScores": {},
        "latencyMs": 1445,
        "gradingResult": {
          "pass": true,
          "score": 1,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 1,
              "reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
        "prompt": {
          "raw": "How can you include an image as part of a Claude API request, and what image formats are currently supported?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How can you include an image as part of a Claude API request, and what image formats are currently supported?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#vision\",\"https://docs.claude.com/en/docs/build-with-claude/vision#about-the-prompt-examples\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/messages-examples#vision",
            "https://docs.claude.com/en/docs/build-with-claude/vision#faq",
            "https://docs.claude.com/en/docs/build-with-claude/vision#how-to-use-vision"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 3095,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
        "prompt": {
          "raw": "How can providing Claude with examples of handling certain edge cases like implicit requests or emotional prioritization help improve its performance in routing support tickets?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How can providing Claude with examples of handling certain edge cases like implicit requests or emotional prioritization help improve its performance in routing support tickets?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#adapting-to-common-scenarios\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#prompting-claude-for-ticket-routing\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#adapting-to-common-scenarios",
            "https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#introduction",
            "https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1397,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
        "prompt": {
          "raw": "How can providing Claude with examples of handling certain edge cases like implicit requests or emotional prioritization help improve its performance in routing support tickets?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How can providing Claude with examples of handling certain edge cases like implicit requests or emotional prioritization help improve its performance in routing support tickets?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#adapting-to-common-scenarios\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#prompting-claude-for-ticket-routing\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#adapting-to-common-scenarios",
            "https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#introduction",
            "https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1399,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
        "prompt": {
          "raw": "What is the relationship between time to first token (TTFT) and latency when evaluating a language model's performance?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What is the relationship between time to first token (TTFT) and latency when evaluating a language model's performance?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#ttft-time-to-first-token\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-measure-latency\",\"https://docs.claude.com/en/docs/resources/glossary#latency\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/resources/glossary#ttft-time-to-first-token",
            "https://docs.claude.com/en/docs/resources/glossary#latency",
            "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-measure-latency"
          ]
        },
        "success": true,
        "score": 1,
        "namedScores": {},
        "latencyMs": 3244,
        "gradingResult": {
          "pass": true,
          "score": 1,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 1,
              "reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
        "prompt": {
          "raw": "How does the stop_reason of \"tool_use\" relate to the overall workflow of integrating external tools with Claude?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How does the stop_reason of \"tool_use\" relate to the overall workflow of integrating external tools with Claude?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#tool-use-and-json-mode\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works",
            "https://docs.claude.com/en/docs/build-with-claude/tool-use#pricing",
            "https://docs.claude.com/en/docs/build-with-claude/tool-use#next-steps"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1468,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
        "prompt": {
          "raw": "How does the stop_reason of \"tool_use\" relate to the overall workflow of integrating external tools with Claude?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How does the stop_reason of \"tool_use\" relate to the overall workflow of integrating external tools with Claude?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#tool-use-and-json-mode\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works",
            "https://docs.claude.com/en/docs/build-with-claude/tool-use#next-steps",
            "https://docs.claude.com/en/docs/build-with-claude/tool-use#chain-of-thought"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1507,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
        "prompt": {
          "raw": "According to the documentation, what error event and corresponding HTTP error code may be sent during periods of high usage for the Claude API when using streaming responses?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "According to the documentation, what error event and corresponding HTTP error code may be sent during periods of high usage for the Claude API when using streaming responses?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#error-events\",\"https://docs.claude.com/en/api/streaming#error-event-types\",\"https://docs.claude.com/en/api/errors#http-errors\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/errors#http-errors",
            "https://docs.claude.com/en/api/messages-streaming#error-events",
            "https://docs.claude.com/en/api/streaming#error-event-types"
          ]
        },
        "success": true,
        "score": 1,
        "namedScores": {},
        "latencyMs": 1216,
        "gradingResult": {
          "pass": true,
          "score": 1,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 1,
              "reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
        "prompt": {
          "raw": "According to the documentation, what error event and corresponding HTTP error code may be sent during periods of high usage for the Claude API when using streaming responses?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "According to the documentation, what error event and corresponding HTTP error code may be sent during periods of high usage for the Claude API when using streaming responses?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#error-events\",\"https://docs.claude.com/en/api/streaming#error-event-types\",\"https://docs.claude.com/en/api/errors#http-errors\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/errors#http-errors",
            "https://docs.claude.com/en/api/messages-streaming#error-events",
            "https://docs.claude.com/en/api/streaming#error-event-types"
          ]
        },
        "success": true,
        "score": 1,
        "namedScores": {},
        "latencyMs": 1217,
        "gradingResult": {
          "pass": true,
          "score": 1,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 1,
              "reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
        "prompt": {
          "raw": "According to the documentation, what error event and corresponding HTTP error code may be sent during periods of high usage for the Claude API when using streaming responses?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "According to the documentation, what error event and corresponding HTTP error code may be sent during periods of high usage for the Claude API when using streaming responses?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#error-events\",\"https://docs.claude.com/en/api/streaming#error-event-types\",\"https://docs.claude.com/en/api/errors#http-errors\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/errors#http-errors",
            "https://docs.claude.com/en/api/messages-streaming#error-events",
            "https://docs.claude.com/en/api/streaming#error-event-types"
          ]
        },
        "success": true,
        "score": 1,
        "namedScores": {},
        "latencyMs": 1,
        "gradingResult": {
          "pass": true,
          "score": 1,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 1,
              "reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
        "prompt": {
          "raw": "How can providing Claude with examples of handling certain edge cases like implicit requests or emotional prioritization help improve its performance in routing support tickets?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How can providing Claude with examples of handling certain edge cases like implicit requests or emotional prioritization help improve its performance in routing support tickets?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#adapting-to-common-scenarios\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#prompting-claude-for-ticket-routing\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#adapting-to-common-scenarios",
            "https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#advantages-of-using-claude",
            "https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 4102,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_three"
|
|
},
|
|
"prompt": {
|
|
"raw": "How does the stop_reason of \"tool_use\" relate to the overall workflow of integrating external tools with Claude?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "How does the stop_reason of \"tool_use\" relate to the overall workflow of integrating external tools with Claude?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#tool-use-and-json-mode\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works",
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#chain-of-thought",
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#forcing-tool-use"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"latencyMs": 3300,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_base"
|
|
},
|
|
"prompt": {
|
|
"raw": "What are the two types of deltas that can be contained in a content_block_delta event when streaming responses from the Claude API?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What are the two types of deltas that can be contained in a content_block_delta event when streaming responses from the Claude API?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#delta-types\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/api/messages-streaming#delta-types",
|
|
"https://docs.claude.com/en/api/messages-streaming#text-delta",
|
|
"https://docs.claude.com/en/api/messages-streaming#input-json-delta"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"latencyMs": 1406,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_base"
|
|
},
|
|
"prompt": {
|
|
"raw": "On what date did Claude 3.5 Sonnet and tool use both become generally available across the Claude API, Amazon Bedrock, and Google Vertex AI?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "On what date did Claude 3.5 Sonnet and tool use both become generally available across the Claude API, Amazon Bedrock, and Google Vertex AI?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#june-20th-2024\",\"https://docs.claude.com/en/release-notes/api#may-30th-2024\"]"
|
|
},
|
|
"error": "Error: Error running Python script: _pickle.UnpicklingError: pickle data was truncated\nStack Trace: Error: _pickle.UnpicklingError: pickle data was truncated\n at PythonShell.parseError (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/python-shell/index.js:303:21)\n at terminateIfNeeded (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/python-shell/index.js:193:32)\n at ChildProcess.<anonymous> (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/python-shell/index.js:185:13)\n at ChildProcess.emit (node:events:519:28)\n at ChildProcess._handle.onexit (node:internal/child_process:294:12)\n --Python Traceback: --\n File \"/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/python/wrapper.py\", line 34, in <module>\n result = call_method(script_path, method_name, *data)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/python/wrapper.py\", line 18, in call_method\n spec.loader.exec_module(script_module)\n File \"<frozen importlib._bootstrap_external>\", line 940, in exec_module\n File \"<frozen importlib._bootstrap>\", line 241, in _call_with_frames_removed\n File \"/Users/sflamini/code/anthropic-cookbook/skills/retrieval_augmented_generation/evaluation/provider_retrieval.py\", line 114, in <module>\n db_rerank.load_data(anthropic_docs_summaries)\n File \"/Users/sflamini/code/anthropic-cookbook/skills/retrieval_augmented_generation/evaluation/vectordb.py\", line 108, in load_data\n self.load_db()\n File \"/Users/sflamini/code/anthropic-cookbook/skills/retrieval_augmented_generation/evaluation/vectordb.py\", line 169, in load_db\n data = pickle.load(file)\n ^^^^^^^^^^^^^^^^^\n\nError: Error running Python script: _pickle.UnpicklingError: pickle data was truncated\nStack Trace: Error: _pickle.UnpicklingError: pickle data was truncated\n at PythonShell.parseError (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/python-shell/index.js:303:21)\n at terminateIfNeeded (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/python-shell/index.js:193:32)\n at ChildProcess.<anonymous> (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/python-shell/index.js:185:13)\n at ChildProcess.emit (node:events:519:28)\n at ChildProcess._handle.onexit (node:internal/child_process:294:12)\n --Python Traceback: --\n File \"/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/python/wrapper.py\", line 34, in <module>\n result = call_method(script_path, method_name, *data)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/python/wrapper.py\", line 18, in call_method\n spec.loader.exec_module(script_module)\n File \"<frozen importlib._bootstrap_external>\", line 940, in exec_module\n File \"<frozen importlib._bootstrap>\", line 241, in _call_with_frames_removed\n File \"/Users/sflamini/code/anthropic-cookbook/skills/retrieval_augmented_generation/evaluation/provider_retrieval.py\", line 114, in <module>\n db_rerank.load_data(anthropic_docs_summaries)\n File \"/Users/sflamini/code/anthropic-cookbook/skills/retrieval_augmented_generation/evaluation/vectordb.py\", line 108, in load_data\n self.load_db()\n File \"/Users/sflamini/code/anthropic-cookbook/skills/retrieval_augmented_generation/evaluation/vectordb.py\", line 169, in load_db\n data = pickle.load(file)\n ^^^^^^^^^^^^^^^^^\n at runPython 
(/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/python/wrapper.js:50:15)\n at process.processTicksAndRejections (node:internal/process/task_queues:95:5)\n at async PythonProvider.executePythonScript (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/providers/pythonCompletion.js:52:31)\n at async Evaluator.runEval (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/evaluator.js:297:28)\n at async processEvalStep (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/evaluator.js:619:25)",
|
|
"success": false,
|
|
"score": 0,
|
|
"namedScores": {},
|
|
"latencyMs": 0
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_two"
|
|
},
|
|
"prompt": {
|
|
"raw": "What are the two types of deltas that can be contained in a content_block_delta event when streaming responses from the Claude API?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What are the two types of deltas that can be contained in a content_block_delta event when streaming responses from the Claude API?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#delta-types\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/api/messages-streaming#delta-types",
|
|
"https://docs.claude.com/en/api/messages-streaming#text-delta",
|
|
"https://docs.claude.com/en/api/messages-streaming#input-json-delta"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"latencyMs": 1881,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_two"
|
|
},
|
|
"prompt": {
|
|
"raw": "On what date did Claude 3.5 Sonnet and tool use both become generally available across the Claude API, Amazon Bedrock, and Google Vertex AI?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "On what date did Claude 3.5 Sonnet and tool use both become generally available across the Claude API, Amazon Bedrock, and Google Vertex AI?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#june-20th-2024\",\"https://docs.claude.com/en/release-notes/api#may-30th-2024\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/release-notes/api#june-20th-2024",
|
|
"https://docs.claude.com/en/release-notes/api#may-30th-2024",
|
|
"https://docs.claude.com/en/docs/intro-to-claude#claude-3-5-family"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"latencyMs": 1483,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_base"
|
|
},
|
|
"prompt": {
|
|
"raw": "In what order did Anthropic launch Claude.ai and the Claude iOS app in Canada and Europe?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "In what order did Anthropic launch Claude.ai and the Claude iOS app in Canada and Europe?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024",
|
|
"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024",
|
|
"https://docs.claude.com/en/docs/intro-to-claude#start-building-with-claude"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"latencyMs": 1169,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_three"
|
|
},
|
|
"prompt": {
|
|
"raw": "In what order did Anthropic launch Claude.ai and the Claude iOS app in Canada and Europe?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "In what order did Anthropic launch Claude.ai and the Claude iOS app in Canada and Europe?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024",
|
|
"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024",
|
|
"https://docs.claude.com/en/docs/intro-to-claude#start-building-with-claude"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"latencyMs": 14,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_two"
|
|
},
|
|
"prompt": {
|
|
"raw": "In what order did Anthropic launch Claude.ai and the Claude iOS app in Canada and Europe?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "In what order did Anthropic launch Claude.ai and the Claude iOS app in Canada and Europe?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024",
|
|
"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024",
|
|
"https://docs.claude.com/en/api/claude-on-vertex-ai#model-availability"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"latencyMs": 1252,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_base"
|
|
},
|
|
"prompt": {
|
|
"raw": "When the API response from Claude has a stop_reason of \"tool_use\", what does this indicate and what should be done next to continue the conversation?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "When the API response from Claude has a stop_reason of \"tool_use\", what does this indicate and what should be done next to continue the conversation?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#json-output\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works",
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#forcing-tool-use",
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"latencyMs": 925,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_three"
|
|
},
|
|
"prompt": {
|
|
"raw": "When the API response from Claude has a stop_reason of \"tool_use\", what does this indicate and what should be done next to continue the conversation?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "When the API response from Claude has a stop_reason of \"tool_use\", what does this indicate and what should be done next to continue the conversation?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#json-output\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works",
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#forcing-tool-use",
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"latencyMs": 2,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_three"
|
|
},
|
|
"prompt": {
|
|
"raw": "On what date did Claude 3.5 Sonnet and tool use both become generally available across the Claude API, Amazon Bedrock, and Google Vertex AI?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "On what date did Claude 3.5 Sonnet and tool use both become generally available across the Claude API, Amazon Bedrock, and Google Vertex AI?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#june-20th-2024\",\"https://docs.claude.com/en/release-notes/api#may-30th-2024\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/release-notes/api#june-20th-2024",
|
|
"https://docs.claude.com/en/release-notes/api#may-30th-2024",
|
|
"https://docs.claude.com/en/docs/about-claude/models#model-names"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"latencyMs": 2978,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_two"
|
|
},
|
|
"prompt": {
|
|
"raw": "When the API response from Claude has a stop_reason of \"tool_use\", what does this indicate and what should be done next to continue the conversation?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "When the API response from Claude has a stop_reason of \"tool_use\", what does this indicate and what should be done next to continue the conversation?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#json-output\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works",
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#forcing-tool-use",
|
|
"https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"latencyMs": 1459,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_base"
|
|
},
|
|
"prompt": {
|
|
"raw": "What Python libraries are used in the example code snippet for evaluating tone and style in a customer service chatbot?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What Python libraries are used in the example code snippet for evaluating tone and style in a customer service chatbot?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#example-evals\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/api/client-sdks#python",
|
|
"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-python-package",
|
|
"https://docs.claude.com/en/api/prompt-validation#examples"
|
|
]
|
|
},
|
|
"success": false,
|
|
"score": 0,
|
|
"namedScores": {},
|
|
"latencyMs": 1183,
|
|
"error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"gradingResult": {
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_three"
|
|
},
|
|
"prompt": {
|
|
"raw": "What Python libraries are used in the example code snippet for evaluating tone and style in a customer service chatbot?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What Python libraries are used in the example code snippet for evaluating tone and style in a customer service chatbot?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#example-evals\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/api/client-sdks#python",
|
|
"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-python-package",
|
|
"https://docs.claude.com/en/api/prompt-validation#examples"
|
|
]
|
|
},
|
|
"success": false,
|
|
"score": 0,
|
|
"namedScores": {},
|
|
"latencyMs": 0,
|
|
"error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"gradingResult": {
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_two"
|
|
},
|
|
"prompt": {
|
|
"raw": "What Python libraries are used in the example code snippet for evaluating tone and style in a customer service chatbot?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What Python libraries are used in the example code snippet for evaluating tone and style in a customer service chatbot?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#example-evals\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/api/client-sdks#python",
|
|
"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-python-package",
|
|
"https://docs.claude.com/en/api/prompt-validation#examples"
|
|
]
|
|
},
|
|
"success": false,
|
|
"score": 0,
|
|
"namedScores": {},
|
|
"latencyMs": 1431,
|
|
"error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"gradingResult": {
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_base"
|
|
},
|
|
"prompt": {
|
|
"raw": "What are the two main ways to authenticate when using the Anthropic Python SDK to access Claude models on Amazon Bedrock?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What are the two main ways to authenticate when using the Anthropic Python SDK to access Claude models on Amazon Bedrock?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#install-an-sdk-for-accessing-bedrock\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests",
|
|
"https://docs.claude.com/en/api/claude-on-amazon-bedrock#install-an-sdk-for-accessing-bedrock",
|
|
"https://docs.claude.com/en/api/claude-on-vertex-ai#making-requests"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"latencyMs": 1543,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_two"
|
|
},
|
|
"prompt": {
|
|
"raw": "What are the two main ways to authenticate when using the Anthropic Python SDK to access Claude models on Amazon Bedrock?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What are the two main ways to authenticate when using the Anthropic Python SDK to access Claude models on Amazon Bedrock?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#install-an-sdk-for-accessing-bedrock\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests",
|
|
"https://docs.claude.com/en/api/claude-on-amazon-bedrock#subscribe-to-anthropic-models",
|
|
"https://docs.claude.com/en/api/claude-on-amazon-bedrock#accessing-bedrock"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"latencyMs": 1577,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_three"
|
|
},
|
|
"prompt": {
|
|
"raw": "What are the two types of deltas that can be contained in a content_block_delta event when streaming responses from the Claude API?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "What are the two types of deltas that can be contained in a content_block_delta event when streaming responses from the Claude API?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#delta-types\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/api/messages-streaming#text-delta",
|
|
"https://docs.claude.com/en/api/messages-streaming#input-json-delta",
|
|
"https://docs.claude.com/en/api/messages-streaming#delta-types"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"latencyMs": 8176,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
  "prompt": { "raw": "When deciding whether to implement leak-resistant prompt engineering strategies, what two factors should be considered and balanced?", "label": "{{ query }}" },
  "vars": { "query": "When deciding whether to implement leak-resistant prompt engineering strategies, what two factors should be considered and balanced?", "correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#strategies-to-reduce-prompt-leak\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak\"]" },
  "response": { "output": ["https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak", "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#strategies-to-reduce-prompt-leak"] },
  "success": true,
  "score": 1,
  "namedScores": {},
  "latencyMs": 1337,
  "gradingResult": {
    "pass": true,
    "score": 1,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 1, "reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0", "componentResults": [{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 } }], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
  "prompt": { "raw": "When deciding whether to implement leak-resistant prompt engineering strategies, what two factors should be considered and balanced?", "label": "{{ query }}" },
  "vars": { "query": "When deciding whether to implement leak-resistant prompt engineering strategies, what two factors should be considered and balanced?", "correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#strategies-to-reduce-prompt-leak\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak\"]" },
  "response": { "output": ["https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak", "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#strategies-to-reduce-prompt-leak"] },
  "success": true,
  "score": 1,
  "namedScores": {},
  "latencyMs": 1365,
  "gradingResult": {
    "pass": true,
    "score": 1,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 1, "reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0", "componentResults": [{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 } }], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
  "prompt": { "raw": "How can selecting the appropriate Claude model based on your specific requirements help reduce latency in your application?", "label": "{{ query }}" },
  "vars": { "query": "How can selecting the appropriate Claude model based on your specific requirements help reduce latency in your application?", "correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\",\"https://docs.claude.com/en/docs/intro-to-claude#model-options\"]" },
  "response": { "output": ["https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model", "https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification", "https://docs.claude.com/en/docs/intro-to-claude#model-options"] },
  "success": true,
  "score": 0.8,
  "namedScores": {},
  "latencyMs": 1379,
  "gradingResult": {
    "pass": true,
    "score": 0.8,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
  "prompt": { "raw": "How can selecting the appropriate Claude model based on your specific requirements help reduce latency in your application?", "label": "{{ query }}" },
  "vars": { "query": "How can selecting the appropriate Claude model based on your specific requirements help reduce latency in your application?", "correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\",\"https://docs.claude.com/en/docs/intro-to-claude#model-options\"]" },
  "response": { "output": ["https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model", "https://docs.claude.com/en/docs/intro-to-claude#model-options", "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-reduce-latency"] },
  "success": true,
  "score": 0.8,
  "namedScores": {},
  "latencyMs": 1372,
  "gradingResult": {
    "pass": true,
    "score": 0.8,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
  "prompt": { "raw": "How can selecting the appropriate Claude model based on your specific requirements help reduce latency in your application?", "label": "{{ query }}" },
  "vars": { "query": "How can selecting the appropriate Claude model based on your specific requirements help reduce latency in your application?", "correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\",\"https://docs.claude.com/en/docs/intro-to-claude#model-options\"]" },
  "response": { "output": ["https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model", "https://docs.claude.com/en/docs/intro-to-claude#model-options", "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-reduce-latency"] },
  "success": true,
  "score": 0.8,
  "namedScores": {},
  "latencyMs": 0,
  "gradingResult": {
    "pass": true,
    "score": 0.8,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
  "prompt": { "raw": "When deciding whether to implement leak-resistant prompt engineering strategies, what two factors should be considered and balanced?", "label": "{{ query }}" },
  "vars": { "query": "When deciding whether to implement leak-resistant prompt engineering strategies, what two factors should be considered and balanced?", "correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#strategies-to-reduce-prompt-leak\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak\"]" },
  "response": { "output": ["https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak", "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#strategies-to-reduce-prompt-leak"] },
  "success": true,
  "score": 1,
  "namedScores": {},
  "latencyMs": 2928,
  "gradingResult": {
    "pass": true,
    "score": 1,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 1, "reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0", "componentResults": [{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 } }], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
  "prompt": { "raw": "How can you stream responses from the Claude API using the Python SDK?", "label": "{{ query }}" },
  "vars": { "query": "How can you stream responses from the Claude API using the Python SDK?", "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#streaming-with-sdks\",\"https://docs.claude.com/en/api/client-sdks#python\"]" },
  "response": { "output": ["https://docs.claude.com/en/api/messages-streaming#streaming-with-sdks", "https://docs.claude.com/en/api/messages-streaming#basic-streaming-request", "https://docs.claude.com/en/api/#authentication"] },
  "success": true,
  "score": 0.4,
  "namedScores": {},
  "latencyMs": 1297,
  "gradingResult": {
    "pass": true,
    "score": 0.4,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0", "componentResults": [{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } }, { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } }, { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
  "prompt": { "raw": "How can you stream responses from the Claude API using the Python SDK?", "label": "{{ query }}" },
  "vars": { "query": "How can you stream responses from the Claude API using the Python SDK?", "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#streaming-with-sdks\",\"https://docs.claude.com/en/api/client-sdks#python\"]" },
  "response": { "output": ["https://docs.claude.com/en/api/messages-streaming#streaming-with-sdks", "https://docs.claude.com/en/docs/quickstart#prerequisites", "https://docs.claude.com/en/api/messages-streaming#basic-streaming-request"] },
  "success": true,
  "score": 0.4,
  "namedScores": {},
  "latencyMs": 1396,
  "gradingResult": {
    "pass": true,
    "score": 0.4,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0", "componentResults": [{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } }, { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } }, { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
  "prompt": { "raw": "How can you guide Claude's response by pre-filling part of the response, and what API parameter is used to generate a short response in this case?", "label": "{{ query }}" },
  "vars": { "query": "How can you guide Claude's response by pre-filling part of the response, and what API parameter is used to generate a short response in this case?", "correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\",\"https://docs.claude.com/en/api/messages-examples#basic-request-and-response\"]" },
  "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#how-to-prefill-claudes-response", "https://docs.claude.com/en/docs/welcome#get-started", "https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth"] },
  "success": true,
  "score": 0.4,
  "namedScores": {},
  "latencyMs": 1193,
  "gradingResult": {
    "pass": true,
    "score": 0.4,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333", "componentResults": [{ "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 } }, { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } }, { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } }, { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
  "prompt": { "raw": "How can you guide Claude's response by pre-filling part of the response, and what API parameter is used to generate a short response in this case?", "label": "{{ query }}" },
  "vars": { "query": "How can you guide Claude's response by pre-filling part of the response, and what API parameter is used to generate a short response in this case?", "correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\",\"https://docs.claude.com/en/api/messages-examples#basic-request-and-response\"]" },
  "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#how-to-prefill-claudes-response", "https://docs.claude.com/en/docs/welcome#get-started", "https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources"] },
  "success": false,
  "score": 0,
  "namedScores": {},
  "latencyMs": 1265,
  "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
  "gradingResult": {
    "pass": false,
    "score": 0,
    "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "componentResults": [{ "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } }, { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } }, { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } }, { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
  "prompt": { "raw": "How can you guide Claude's response by pre-filling part of the response, and what API parameter is used to generate a short response in this case?", "label": "{{ query }}" },
  "vars": { "query": "How can you guide Claude's response by pre-filling part of the response, and what API parameter is used to generate a short response in this case?", "correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\",\"https://docs.claude.com/en/api/messages-examples#basic-request-and-response\"]" },
  "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#how-to-prefill-claudes-response", "https://docs.claude.com/en/docs/welcome#get-started", "https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources"] },
  "success": false,
  "score": 0,
  "namedScores": {},
  "latencyMs": 1,
  "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
  "gradingResult": {
    "pass": false,
    "score": 0,
    "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "componentResults": [{ "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } }, { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } }, { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } }, { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
  "prompt": { "raw": "What is more important when building an eval set for an AI system - having a larger number of test cases with automated grading, or having fewer high-quality test cases graded by humans?", "label": "{{ query }}" },
  "vars": { "query": "What is more important when building an eval set for an AI system - having a larger number of test cases with automated grading, or having fewer high-quality test cases graded by humans?", "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#eval-design-principles\",\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#building-evals-and-test-cases\"]" },
  "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/develop-tests#eval-design-principles", "https://docs.claude.com/en/docs/build-with-claude/develop-tests#grading-evals", "https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results"] },
  "success": true,
  "score": 0.4,
  "namedScores": {},
  "latencyMs": 1322,
  "gradingResult": {
    "pass": true,
    "score": 0.4,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0", "componentResults": [{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } }, { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } }, { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
  "prompt": { "raw": "What is more important when building an eval set for an AI system - having a larger number of test cases with automated grading, or having fewer high-quality test cases graded by humans?", "label": "{{ query }}" },
  "vars": { "query": "What is more important when building an eval set for an AI system - having a larger number of test cases with automated grading, or having fewer high-quality test cases graded by humans?", "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#eval-design-principles\",\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#building-evals-and-test-cases\"]" },
  "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/develop-tests#eval-design-principles", "https://docs.claude.com/en/docs/build-with-claude/develop-tests#grading-evals", "https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results"] },
  "success": true,
  "score": 0.4,
  "namedScores": {},
  "latencyMs": 1280,
  "gradingResult": {
    "pass": true,
    "score": 0.4,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0", "componentResults": [{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } }, { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } }, { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
  "prompt": { "raw": "What is more important when building an eval set for an AI system - having a larger number of test cases with automated grading, or having fewer high-quality test cases graded by humans?", "label": "{{ query }}" },
  "vars": { "query": "What is more important when building an eval set for an AI system - having a larger number of test cases with automated grading, or having fewer high-quality test cases graded by humans?", "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#eval-design-principles\",\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#building-evals-and-test-cases\"]" },
  "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/develop-tests#eval-design-principles", "https://docs.claude.com/en/docs/build-with-claude/develop-tests#grading-evals", "https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results"] },
  "success": true,
  "score": 0.4,
  "namedScores": {},
  "latencyMs": 1,
  "gradingResult": {
    "pass": true,
    "score": 0.4,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0", "componentResults": [{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } }, { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } }, { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
  "prompt": { "raw": "What are the two main ways to authenticate when using the Anthropic Python SDK to access Claude models on Amazon Bedrock?", "label": "{{ query }}" },
  "vars": { "query": "What are the two main ways to authenticate when using the Anthropic Python SDK to access Claude models on Amazon Bedrock?", "correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#install-an-sdk-for-accessing-bedrock\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests\"]" },
  "response": { "output": ["https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests", "https://docs.claude.com/en/api/claude-on-amazon-bedrock#install-an-sdk-for-accessing-bedrock", "https://docs.claude.com/en/api/claude-on-amazon-bedrock#api-model-names"] },
  "success": true,
  "score": 0.8,
  "namedScores": {},
  "latencyMs": 8341,
  "gradingResult": {
    "pass": true,
    "score": 0.8,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
  "prompt": { "raw": "How can you stream responses from the Claude API using the Python SDK?", "label": "{{ query }}" },
  "vars": { "query": "How can you stream responses from the Claude API using the Python SDK?", "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#streaming-with-sdks\",\"https://docs.claude.com/en/api/client-sdks#python\"]" },
  "response": { "output": ["https://docs.claude.com/en/api/messages-streaming#streaming-with-sdks", "https://docs.claude.com/en/api/client-sdks#python", "https://docs.claude.com/en/api/messages-streaming#basic-streaming-request"] },
  "success": true,
  "score": 0.8,
  "namedScores": {},
  "latencyMs": 3824,
  "gradingResult": {
    "pass": true,
    "score": 0.8,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
  "prompt": { "raw": "What are the two required fields in a content_block_delta event for a text delta type?", "label": "{{ query }}" },
  "vars": { "query": "What are the two required fields in a content_block_delta event for a text delta type?", "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#delta-types\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]" },
  "response": { "output": ["https://docs.claude.com/en/api/messages-streaming#delta-types", "https://docs.claude.com/en/api/messages-streaming#text-delta", "https://docs.claude.com/en/api/messages-streaming#input-json-delta"] },
  "success": true,
  "score": 0.8,
  "namedScores": {},
  "latencyMs": 1478,
  "gradingResult": {
    "pass": true,
    "score": 0.8,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
  "prompt": { "raw": "What are the two required fields in a content_block_delta event for a text delta type?", "label": "{{ query }}" },
  "vars": { "query": "What are the two required fields in a content_block_delta event for a text delta type?", "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#delta-types\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]" },
  "response": { "output": ["https://docs.claude.com/en/api/messages-streaming#text-delta", "https://docs.claude.com/en/api/messages-streaming#delta-types", "https://docs.claude.com/en/api/messages-streaming#input-json-delta"] },
  "success": true,
  "score": 0.8,
  "namedScores": {},
  "latencyMs": 1507,
  "gradingResult": {
    "pass": true,
    "score": 0.8,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
  "prompt": { "raw": "What are two interactive ways to learn how to use Claude's capabilities, such as uploading PDFs and generating embeddings?", "label": "{{ query }}" },
  "vars": { "query": "What are two interactive ways to learn how to use Claude's capabilities, such as uploading PDFs and generating embeddings?", "correct_chunks": "[\"https://docs.claude.com/en/docs/quickstart#next-steps\",\"https://docs.claude.com/en/docs/welcome#develop-with-claude\"]" },
  "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook", "https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources", "https://docs.claude.com/en/docs/about-claude/models#get-started-with-claude"] },
  "success": false,
  "score": 0,
  "namedScores": {},
  "latencyMs": 1487,
  "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
  "gradingResult": {
    "pass": false,
    "score": 0,
    "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "componentResults": [{ "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } }, { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } }, { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } }, { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
{
  "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
  "prompt": { "raw": "What are the two required fields in a content_block_delta event for a text delta type?", "label": "{{ query }}" },
  "vars": { "query": "What are the two required fields in a content_block_delta event for a text delta type?", "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#delta-types\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]" },
  "response": { "output": ["https://docs.claude.com/en/api/messages-streaming#text-delta", "https://docs.claude.com/en/api/messages-streaming#delta-types", "https://docs.claude.com/en/api/messages-streaming#event-types"] },
  "success": true,
  "score": 0.8,
  "namedScores": {},
  "latencyMs": 2091,
  "gradingResult": {
    "pass": true,
    "score": 0.8,
    "reason": "All assertions passed",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  }
},
|
|
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_two"
        },
        "prompt": {
          "raw": "What are two interactive ways to learn how to use Claude's capabilities, such as uploading PDFs and generating embeddings?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What are two interactive ways to learn how to use Claude's capabilities, such as uploading PDFs and generating embeddings?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/quickstart#next-steps\",\"https://docs.claude.com/en/docs/welcome#develop-with-claude\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#claude-for-sheets-usage-examples",
            "https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook",
            "https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#further-information"
          ]
        },
        "success": false,
        "score": 0,
        "namedScores": {},
        "latencyMs": 1628,
        "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "gradingResult": {
          "pass": false,
          "score": 0,
          "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": false,
              "score": 0,
              "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 0,
                  "reason": "MRR is 0",
                  "named_scores": {
                    "MRR": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "Precision is 0.0",
                  "named_scores": {
                    "Precision": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "Recall is 0.0",
                  "named_scores": {
                    "Recall": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "F1 is 0",
                  "named_scores": {
                    "F1": 0
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "MRR is 0",
              "named_scores": {
                "MRR": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "Precision is 0.0",
              "named_scores": {
                "Precision": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "Recall is 0.0",
              "named_scores": {
                "Recall": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "F1 is 0",
              "named_scores": {
                "F1": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_base"
        },
        "prompt": {
          "raw": "Why does breaking a task into distinct subtasks for chained prompts help improve Claude's accuracy on the overall task?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "Why does breaking a task into distinct subtasks for chained prompts help improve Claude's accuracy on the overall task?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts",
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts",
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#why-let-claude-think"
          ]
        },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1533,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.6666666666666666,
                  "reason": "Precision is 0.6666666666666666",
                  "named_scores": {
                    "Precision": 0.6666666666666666
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.8,
                  "reason": "F1 is 0.8",
                  "named_scores": {
                    "F1": 0.8
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.6666666666666666,
              "reason": "Precision is 0.6666666666666666",
              "named_scores": {
                "Precision": 0.6666666666666666
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.8,
              "reason": "F1 is 0.8",
              "named_scores": {
                "F1": 0.8
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_two"
        },
        "prompt": {
          "raw": "Why does breaking a task into distinct subtasks for chained prompts help improve Claude's accuracy on the overall task?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "Why does breaking a task into distinct subtasks for chained prompts help improve Claude's accuracy on the overall task?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts",
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts",
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#why-let-claude-think"
          ]
        },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1486,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.6666666666666666,
                  "reason": "Precision is 0.6666666666666666",
                  "named_scores": {
                    "Precision": 0.6666666666666666
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.8,
                  "reason": "F1 is 0.8",
                  "named_scores": {
                    "F1": 0.8
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.6666666666666666,
              "reason": "Precision is 0.6666666666666666",
              "named_scores": {
                "Precision": 0.6666666666666666
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.8,
              "reason": "F1 is 0.8",
              "named_scores": {
                "F1": 0.8
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_two"
        },
        "prompt": {
          "raw": "How does the streaming format for Messages responses differ from Text Completions streaming responses?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How does the streaming format for Messages responses differ from Text Completions streaming responses?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format",
            "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs",
            "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#3-leverage-streaming"
          ]
        },
        "success": true,
        "score": 0.5,
        "namedScores": {},
        "latencyMs": 1248,
        "gradingResult": {
          "pass": true,
          "score": 0.5,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.5,
              "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "F1 is 0.5",
                  "named_scores": {
                    "F1": 0.5
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "F1 is 0.5",
              "named_scores": {
                "F1": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_base"
        },
        "prompt": {
          "raw": "How does the streaming format for Messages responses differ from Text Completions streaming responses?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How does the streaming format for Messages responses differ from Text Completions streaming responses?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format",
            "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs",
            "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#3-leverage-streaming"
          ]
        },
        "success": true,
        "score": 0.5,
        "namedScores": {},
        "latencyMs": 1353,
        "gradingResult": {
          "pass": true,
          "score": 0.5,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.5,
              "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "F1 is 0.5",
                  "named_scores": {
                    "F1": 0.5
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "F1 is 0.5",
              "named_scores": {
                "F1": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_three"
        },
        "prompt": {
          "raw": "What are two interactive ways to learn how to use Claude's capabilities, such as uploading PDFs and generating embeddings?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What are two interactive ways to learn how to use Claude's capabilities, such as uploading PDFs and generating embeddings?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/quickstart#next-steps\",\"https://docs.claude.com/en/docs/welcome#develop-with-claude\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook",
            "https://docs.claude.com/en/docs/welcome#develop-with-claude",
            "https://docs.claude.com/en/docs/welcome#get-started"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 3638,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
              "componentResults": [
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "MRR is 0.5",
                  "named_scores": {
                    "MRR": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "MRR is 0.5",
              "named_scores": {
                "MRR": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_three"
        },
        "prompt": {
          "raw": "How does the streaming format for Messages responses differ from Text Completions streaming responses?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How does the streaming format for Messages responses differ from Text Completions streaming responses?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format",
            "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs",
            "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#3-leverage-streaming"
          ]
        },
        "success": true,
        "score": 0.5,
        "namedScores": {},
        "latencyMs": 0,
        "gradingResult": {
          "pass": true,
          "score": 0.5,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.5,
              "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "F1 is 0.5",
                  "named_scores": {
                    "F1": 0.5
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "F1 is 0.5",
              "named_scores": {
                "F1": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_base"
        },
        "prompt": {
          "raw": "What are two ways to start experimenting with Claude as a user, according to Anthropic's documentation?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What are two ways to start experimenting with Claude as a user, according to Anthropic's documentation?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/models#get-started-with-claude\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/intro-to-claude#start-building-with-claude",
            "https://docs.claude.com/en/docs/welcome#develop-with-claude",
            "https://docs.claude.com/en/docs/welcome#get-started"
          ]
        },
        "success": false,
        "score": 0,
        "namedScores": {},
        "latencyMs": 1179,
        "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "gradingResult": {
          "pass": false,
          "score": 0,
          "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": false,
              "score": 0,
              "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 0,
                  "reason": "MRR is 0",
                  "named_scores": {
                    "MRR": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "Precision is 0.0",
                  "named_scores": {
                    "Precision": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "Recall is 0.0",
                  "named_scores": {
                    "Recall": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "F1 is 0",
                  "named_scores": {
                    "F1": 0
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "MRR is 0",
              "named_scores": {
                "MRR": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "Precision is 0.0",
              "named_scores": {
                "Precision": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "Recall is 0.0",
              "named_scores": {
                "Recall": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "F1 is 0",
              "named_scores": {
                "F1": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_two"
        },
        "prompt": {
          "raw": "What are two ways to start experimenting with Claude as a user, according to Anthropic's documentation?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What are two ways to start experimenting with Claude as a user, according to Anthropic's documentation?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/models#get-started-with-claude\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/welcome#get-started",
            "https://docs.claude.com/en/docs/about-claude/models#get-started-with-claude",
            "https://docs.claude.com/en/docs/welcome#develop-with-claude"
          ]
        },
        "success": true,
        "score": 0.5,
        "namedScores": {},
        "latencyMs": 1251,
        "gradingResult": {
          "pass": true,
          "score": 0.5,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.5,
              "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.5",
              "componentResults": [
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "MRR is 0.5",
                  "named_scores": {
                    "MRR": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "F1 is 0.5",
                  "named_scores": {
                    "F1": 0.5
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "MRR is 0.5",
              "named_scores": {
                "MRR": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "F1 is 0.5",
              "named_scores": {
                "F1": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_three"
        },
        "prompt": {
          "raw": "Why does breaking a task into distinct subtasks for chained prompts help improve Claude's accuracy on the overall task?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "Why does breaking a task into distinct subtasks for chained prompts help improve Claude's accuracy on the overall task?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts",
            "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#chain-prompts-for-complex-tasks",
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#when-to-chain-prompts"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 3546,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_base"
        },
        "prompt": {
          "raw": "How can using chain prompts help reduce errors and inconsistency in complex tasks handled by Claude?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How can using chain prompts help reduce errors and inconsistency in complex tasks handled by Claude?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#chain-prompts-for-complex-tasks\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts",
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#when-to-chain-prompts",
            "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#chain-prompts-for-complex-tasks"
          ]
        },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1490,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.6666666666666666,
                  "reason": "Precision is 0.6666666666666666",
                  "named_scores": {
                    "Precision": 0.6666666666666666
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.8,
                  "reason": "F1 is 0.8",
                  "named_scores": {
                    "F1": 0.8
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.6666666666666666,
              "reason": "Precision is 0.6666666666666666",
              "named_scores": {
                "Precision": 0.6666666666666666
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.8,
              "reason": "F1 is 0.8",
              "named_scores": {
                "F1": 0.8
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_two"
        },
        "prompt": {
          "raw": "How can using chain prompts help reduce errors and inconsistency in complex tasks handled by Claude?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How can using chain prompts help reduce errors and inconsistency in complex tasks handled by Claude?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#chain-prompts-for-complex-tasks\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts",
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#when-to-chain-prompts",
            "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#chain-prompts-for-complex-tasks"
          ]
        },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1446,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.6666666666666666,
                  "reason": "Precision is 0.6666666666666666",
                  "named_scores": {
                    "Precision": 0.6666666666666666
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.8,
                  "reason": "F1 is 0.8",
                  "named_scores": {
                    "F1": 0.8
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.6666666666666666,
              "reason": "Precision is 0.6666666666666666",
              "named_scores": {
                "Precision": 0.6666666666666666
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.8,
              "reason": "F1 is 0.8",
              "named_scores": {
                "F1": 0.8
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_base"
        },
        "prompt": {
          "raw": "What HTTP status code does an overloaded_error event correspond to in a non-streaming context for the Claude API?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What HTTP status code does an overloaded_error event correspond to in a non-streaming context for the Claude API?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/streaming#error-event-types\",\"https://docs.claude.com/en/api/messages-streaming#error-events\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/messages-streaming#error-events",
            "https://docs.claude.com/en/api/streaming#error-event-types",
            "https://docs.claude.com/en/api/errors#http-errors"
          ]
        },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1073,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.6666666666666666,
                  "reason": "Precision is 0.6666666666666666",
                  "named_scores": {
                    "Precision": 0.6666666666666666
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.8,
                  "reason": "F1 is 0.8",
                  "named_scores": {
                    "F1": 0.8
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.6666666666666666,
              "reason": "Precision is 0.6666666666666666",
              "named_scores": {
                "Precision": 0.6666666666666666
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.8,
              "reason": "F1 is 0.8",
              "named_scores": {
                "F1": 0.8
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_two"
        },
        "prompt": {
          "raw": "What HTTP status code does an overloaded_error event correspond to in a non-streaming context for the Claude API?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What HTTP status code does an overloaded_error event correspond to in a non-streaming context for the Claude API?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/streaming#error-event-types\",\"https://docs.claude.com/en/api/messages-streaming#error-events\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/messages-streaming#error-events",
            "https://docs.claude.com/en/api/streaming#error-event-types",
            "https://docs.claude.com/en/api/errors#http-errors"
          ]
        },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1205,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.6666666666666666,
                  "reason": "Precision is 0.6666666666666666",
                  "named_scores": {
                    "Precision": 0.6666666666666666
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.8,
                  "reason": "F1 is 0.8",
                  "named_scores": {
                    "F1": 0.8
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.6666666666666666,
              "reason": "Precision is 0.6666666666666666",
              "named_scores": {
                "Precision": 0.6666666666666666
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.8,
              "reason": "F1 is 0.8",
              "named_scores": {
                "F1": 0.8
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_three"
        },
        "prompt": {
          "raw": "What HTTP status code does an overloaded_error event correspond to in a non-streaming context for the Claude API?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What HTTP status code does an overloaded_error event correspond to in a non-streaming context for the Claude API?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/streaming#error-event-types\",\"https://docs.claude.com/en/api/messages-streaming#error-events\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/messages-streaming#error-events",
            "https://docs.claude.com/en/api/streaming#error-event-types",
            "https://docs.claude.com/en/api/errors#http-errors"
          ]
        },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 0,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.6666666666666666,
                  "reason": "Precision is 0.6666666666666666",
                  "named_scores": {
                    "Precision": 0.6666666666666666
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.8,
                  "reason": "F1 is 0.8",
                  "named_scores": {
                    "F1": 0.8
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.6666666666666666,
              "reason": "Precision is 0.6666666666666666",
              "named_scores": {
                "Precision": 0.6666666666666666
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.8,
              "reason": "F1 is 0.8",
              "named_scores": {
                "F1": 0.8
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_three"
        },
        "prompt": {
          "raw": "How can using chain prompts help reduce errors and inconsistency in complex tasks handled by Claude?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How can using chain prompts help reduce errors and inconsistency in complex tasks handled by Claude?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#chain-prompts-for-complex-tasks\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#chain-prompts-for-complex-tasks",
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts",
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#when-to-chain-prompts"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 3218,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_two"
        },
        "prompt": {
          "raw": "What are the two ways to specify the format in which Voyage AI returns embeddings through its HTTP API?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What are the two ways to specify the format in which Voyage AI returns embeddings through its HTTP API?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api",
            "https://docs.claude.com/en/docs/build-with-claude/embeddings#getting-started-with-voyage-ai",
            "https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic"
          ]
        },
        "success": true,
        "score": 0.5,
        "namedScores": {},
        "latencyMs": 1260,
        "gradingResult": {
          "pass": true,
          "score": 0.5,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.5,
              "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "F1 is 0.5",
                  "named_scores": {
                    "F1": 0.5
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "F1 is 0.5",
              "named_scores": {
                "F1": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_base"
        },
        "prompt": {
          "raw": "What are the two ways to specify the format in which Voyage AI returns embeddings through its HTTP API?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What are the two ways to specify the format in which Voyage AI returns embeddings through its HTTP API?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api",
            "https://docs.claude.com/en/docs/build-with-claude/embeddings#getting-started-with-voyage-ai",
            "https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic"
          ]
        },
        "success": true,
        "score": 0.5,
        "namedScores": {},
        "latencyMs": 1307,
        "gradingResult": {
          "pass": true,
          "score": 0.5,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.5,
              "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "F1 is 0.5",
                  "named_scores": {
                    "F1": 0.5
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "F1 is 0.5",
              "named_scores": {
                "F1": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_base"
        },
        "prompt": {
          "raw": "When streaming API requests that use tools, how are the input JSON deltas for tool_use content blocks sent, and how can they be accumulated and parsed by the client?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "When streaming API requests that use tools, how are the input JSON deltas for tool_use content blocks sent, and how can they be accumulated and parsed by the client?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#streaming-request-with-tool-use\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/messages-streaming#input-json-delta",
            "https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works",
            "https://docs.claude.com/en/api/messages-streaming#text-delta"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1308,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_two"
        },
        "prompt": {
          "raw": "When streaming API requests that use tools, how are the input JSON deltas for tool_use content blocks sent, and how can they be accumulated and parsed by the client?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "When streaming API requests that use tools, how are the input JSON deltas for tool_use content blocks sent, and how can they be accumulated and parsed by the client?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#streaming-request-with-tool-use\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/messages-streaming#input-json-delta",
            "https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works",
            "https://docs.claude.com/en/api/messages-streaming#text-delta"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1372,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "When streaming API requests that use tools, how are the input JSON deltas for tool_use content blocks sent, and how can they be accumulated and parsed by the client?",
"label": "{{ query }}"
},
"vars": {
"query": "When streaming API requests that use tools, how are the input JSON deltas for tool_use content blocks sent, and how can they be accumulated and parsed by the client?",
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#streaming-request-with-tool-use\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/api/messages-streaming#input-json-delta",
"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works",
"https://docs.claude.com/en/api/messages-streaming#text-delta"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 0,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "What are the two ways to specify the format in which Voyage AI returns embeddings through its HTTP API?",
"label": "{{ query }}"
},
"vars": {
"query": "What are the two ways to specify the format in which Voyage AI returns embeddings through its HTTP API?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api",
"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-python-package",
"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example"
]
},
"success": true,
"score": 0.5,
"namedScores": {},
"latencyMs": 3016,
"gradingResult": {
"pass": true,
"score": 0.5,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "What are the two interactive prompt engineering tutorials that Anthropic offers, and how do they differ?",
"label": "{{ query }}"
},
"vars": {
"query": "What are the two interactive prompt engineering tutorials that Anthropic offers, and how do they differ?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial",
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial",
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow"
]
},
"success": true,
"score": 0.8,
"namedScores": {},
"latencyMs": 1363,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "What are the two interactive prompt engineering tutorials that Anthropic offers, and how do they differ?",
"label": "{{ query }}"
},
"vars": {
"query": "What are the two interactive prompt engineering tutorials that Anthropic offers, and how do they differ?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial",
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial",
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow"
]
},
"success": true,
"score": 0.8,
"namedScores": {},
"latencyMs": 1322,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "What are some of the key capabilities that make Claude suitable for enterprise use cases requiring integration with specialized applications and processing of large volumes of sensitive data?",
"label": "{{ query }}"
},
"vars": {
"query": "What are some of the key capabilities that make Claude suitable for enterprise use cases requiring integration with specialized applications and processing of large volumes of sensitive data?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/intro-to-claude#enterprise-considerations\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/intro-to-claude#model-options",
"https://docs.claude.com/en/docs/intro-to-claude#enterprise-considerations",
"https://docs.claude.com/en/docs/intro-to-claude#implementing-claude"
]
},
"success": true,
"score": 0.5,
"namedScores": {},
"latencyMs": 1275,
"gradingResult": {
"pass": true,
"score": 0.5,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.5",
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "What are two ways to start experimenting with Claude as a user, according to Anthropic's documentation?",
"label": "{{ query }}"
},
"vars": {
"query": "What are two ways to start experimenting with Claude as a user, according to Anthropic's documentation?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/models#get-started-with-claude\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/about-claude/models#get-started-with-claude",
"https://docs.claude.com/en/docs/welcome#get-started",
"https://docs.claude.com/en/docs/intro-to-claude#start-building-with-claude"
]
},
"success": true,
"score": 0.5,
"namedScores": {},
"latencyMs": 9837,
"gradingResult": {
"pass": true,
"score": 0.5,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "What are some of the key capabilities that make Claude suitable for enterprise use cases requiring integration with specialized applications and processing of large volumes of sensitive data?",
"label": "{{ query }}"
},
"vars": {
"query": "What are some of the key capabilities that make Claude suitable for enterprise use cases requiring integration with specialized applications and processing of large volumes of sensitive data?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/intro-to-claude#enterprise-considerations\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/intro-to-claude#enterprise-considerations",
"https://docs.claude.com/en/docs/intro-to-claude#what-you-can-do-with-claude",
"https://docs.claude.com/en/docs/intro-to-claude#model-options"
]
},
"success": true,
"score": 0.5,
"namedScores": {},
"latencyMs": 1406,
"gradingResult": {
"pass": true,
"score": 0.5,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "What are some of the key capabilities that make Claude suitable for enterprise use cases requiring integration with specialized applications and processing of large volumes of sensitive data?",
"label": "{{ query }}"
},
"vars": {
"query": "What are some of the key capabilities that make Claude suitable for enterprise use cases requiring integration with specialized applications and processing of large volumes of sensitive data?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/intro-to-claude#enterprise-considerations\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/intro-to-claude#enterprise-considerations",
"https://docs.claude.com/en/docs/intro-to-claude#what-you-can-do-with-claude",
"https://docs.claude.com/en/docs/intro-to-claude#model-options"
]
},
"success": true,
"score": 0.5,
"namedScores": {},
"latencyMs": 1,
"gradingResult": {
"pass": true,
"score": 0.5,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "As of June 2024, in which regions are Anthropic's Claude.ai API and iOS app available?",
"label": "{{ query }}"
},
"vars": {
"query": "As of June 2024, in which regions are Anthropic's Claude.ai API and iOS app available?",
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/claude-apps#may-1st-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024",
"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024",
"https://docs.claude.com/en/api/claude-on-vertex-ai#model-availability"
]
},
"success": true,
"score": 0.6666666666666666,
"namedScores": {},
"latencyMs": 1268,
"gradingResult": {
"pass": true,
"score": 0.6666666666666666,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision: 0.6666666666666666 \n Recall: 0.6666666666666666 \n F1 Score: 0.6666666666666666 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Recall is 0.6666666666666666",
"named_scores": {
"Recall": 0.6666666666666666
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "F1 is 0.6666666666666666",
"named_scores": {
"F1": 0.6666666666666666
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Recall is 0.6666666666666666",
"named_scores": {
"Recall": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "F1 is 0.6666666666666666",
"named_scores": {
"F1": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "As of June 2024, in which regions are Anthropic's Claude.ai API and iOS app available?",
"label": "{{ query }}"
},
"vars": {
"query": "As of June 2024, in which regions are Anthropic's Claude.ai API and iOS app available?",
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/claude-apps#may-1st-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024",
"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024",
"https://docs.claude.com/en/api/claude-on-vertex-ai#model-availability"
]
},
"success": true,
"score": 0.6666666666666666,
"namedScores": {},
"latencyMs": 1303,
"gradingResult": {
"pass": true,
"score": 0.6666666666666666,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision: 0.6666666666666666 \n Recall: 0.6666666666666666 \n F1 Score: 0.6666666666666666 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Recall is 0.6666666666666666",
"named_scores": {
"Recall": 0.6666666666666666
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "F1 is 0.6666666666666666",
"named_scores": {
"F1": 0.6666666666666666
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Recall is 0.6666666666666666",
"named_scores": {
"Recall": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "F1 is 0.6666666666666666",
"named_scores": {
"F1": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "What are the two interactive prompt engineering tutorials that Anthropic offers, and how do they differ?",
"label": "{{ query }}"
},
"vars": {
"query": "What are the two interactive prompt engineering tutorials that Anthropic offers, and how do they differ?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial",
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prompt-generator#next-steps",
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial"
]
},
"success": true,
"score": 0.8,
"namedScores": {},
"latencyMs": 3510,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "What are the two main approaches for integrating Claude into a support ticket workflow, and how do they differ in terms of scalability and ease of implementation?",
"label": "{{ query }}"
},
"vars": {
"query": "What are the two main approaches for integrating Claude into a support ticket workflow, and how do they differ in terms of scalability and ease of implementation?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#introduction\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow",
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#advantages-of-using-claude",
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#introduction"
]
},
"success": true,
"score": 0.8,
"namedScores": {},
"latencyMs": 1565,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "What are the two main approaches for integrating Claude into a support ticket workflow, and how do they differ in terms of scalability and ease of implementation?",
"label": "{{ query }}"
},
"vars": {
"query": "What are the two main approaches for integrating Claude into a support ticket workflow, and how do they differ in terms of scalability and ease of implementation?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#introduction\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow",
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations",
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#introduction"
]
},
"success": true,
"score": 0.8,
"namedScores": {},
"latencyMs": 1540,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "When did Anthropic release a prompt generator tool to help guide Claude in generating high-quality prompts, and through what interface is it available?",
"label": "{{ query }}"
},
"vars": {
"query": "When did Anthropic release a prompt generator tool to help guide Claude in generating high-quality prompts, and through what interface is it available?",
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#may-10th-2024\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources",
"https://docs.claude.com/en/docs/welcome#develop-with-claude",
"https://docs.claude.com/en/docs/intro-to-claude#start-building-with-claude"
]
},
"success": false,
"score": 0,
"namedScores": {},
"latencyMs": 1194,
"error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"gradingResult": {
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"componentResults": [
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "When did Anthropic release a prompt generator tool to help guide Claude in generating high-quality prompts, and through what interface is it available?",
"label": "{{ query }}"
},
"vars": {
"query": "When did Anthropic release a prompt generator tool to help guide Claude in generating high-quality prompts, and through what interface is it available?",
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#may-10th-2024\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/release-notes/api#may-10th-2024",
"https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources",
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow"
]
},
"success": true,
"score": 0.5,
"namedScores": {},
"latencyMs": 1253,
"gradingResult": {
"pass": true,
"score": 0.5,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "When did Anthropic release a prompt generator tool to help guide Claude in generating high-quality prompts, and through what interface is it available?",
"label": "{{ query }}"
},
"vars": {
"query": "When did Anthropic release a prompt generator tool to help guide Claude in generating high-quality prompts, and through what interface is it available?",
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#may-10th-2024\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/release-notes/api#may-10th-2024",
"https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources",
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow"
]
},
"success": true,
"score": 0.5,
"namedScores": {},
"latencyMs": 1,
"gradingResult": {
"pass": true,
"score": 0.5,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "Which Claude 3 model provides the best balance of intelligence and speed for high-throughput tasks like sales forecasting and targeted marketing?",
"label": "{{ query }}"
},
"vars": {
"query": "Which Claude 3 model provides the best balance of intelligence and speed for high-throughput tasks like sales forecasting and targeted marketing?",
"correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-vertex-ai#api-model-names\",\"https://docs.claude.com/en/docs/intro-to-claude#claude-3-family\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/intro-to-claude#claude-3-family",
"https://docs.claude.com/en/docs/about-claude/models#model-comparison",
"https://docs.claude.com/en/docs/welcome#models"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 1257,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "Which Claude 3 model provides the best balance of intelligence and speed for high-throughput tasks like sales forecasting and targeted marketing?",
"label": "{{ query }}"
},
"vars": {
"query": "Which Claude 3 model provides the best balance of intelligence and speed for high-throughput tasks like sales forecasting and targeted marketing?",
"correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-vertex-ai#api-model-names\",\"https://docs.claude.com/en/docs/intro-to-claude#claude-3-family\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification",
"https://docs.claude.com/en/docs/welcome#models",
"https://docs.claude.com/en/docs/about-claude/models#model-comparison"
]
},
"success": false,
"score": 0,
"namedScores": {},
"latencyMs": 1350,
"error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"gradingResult": {
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"componentResults": [
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "Which Claude 3 model provides the best balance of intelligence and speed for high-throughput tasks like sales forecasting and targeted marketing?",
"label": "{{ query }}"
},
"vars": {
"query": "Which Claude 3 model provides the best balance of intelligence and speed for high-throughput tasks like sales forecasting and targeted marketing?",
"correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-vertex-ai#api-model-names\",\"https://docs.claude.com/en/docs/intro-to-claude#claude-3-family\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification",
"https://docs.claude.com/en/docs/welcome#models",
"https://docs.claude.com/en/docs/about-claude/models#model-comparison"
]
},
"success": false,
"score": 0,
"namedScores": {},
"latencyMs": 0,
"error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"gradingResult": {
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"componentResults": [
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
			{
				"provider": {
					"id": "python:provider_retrieval.py:retrieve_base"
				},
				"prompt": {
					"raw": "How can you calculate the similarity between two Voyage embedding vectors, and what is this equivalent to since Voyage embeddings are normalized to length 1?",
					"label": "{{ query }}"
				},
				"vars": {
					"query": "How can you calculate the similarity between two Voyage embedding vectors, and what is this equivalent to since Voyage embeddings are normalized to length 1?",
					"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#faq\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example\"]"
				},
				"response": {
					"output": [
						"https://docs.claude.com/en/docs/build-with-claude/embeddings#faq",
						"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example",
						"https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models"
					]
				},
				"success": true,
				"score": 0.8,
				"namedScores": {},
				"latencyMs": 1318,
				"gradingResult": {
					"pass": true,
					"score": 0.8,
					"reason": "All assertions passed",
					"namedScores": {},
					"tokensUsed": {
						"total": 0,
						"prompt": 0,
						"completion": 0
					},
					"componentResults": [
						{
							"pass": true,
							"score": 0.8,
							"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
							"componentResults": [
								{
									"pass": true,
									"score": 1,
									"reason": "MRR is 1.0",
									"named_scores": {
										"MRR": 1
									}
								},
								{
									"pass": true,
									"score": 0.6666666666666666,
									"reason": "Precision is 0.6666666666666666",
									"named_scores": {
										"Precision": 0.6666666666666666
									}
								},
								{
									"pass": true,
									"score": 1,
									"reason": "Recall is 1.0",
									"named_scores": {
										"Recall": 1
									}
								},
								{
									"pass": true,
									"score": 0.8,
									"reason": "F1 is 0.8",
									"named_scores": {
										"F1": 0.8
									}
								}
							],
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "MRR is 1.0",
							"named_scores": {
								"MRR": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.6666666666666666,
							"reason": "Precision is 0.6666666666666666",
							"named_scores": {
								"Precision": 0.6666666666666666
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "Recall is 1.0",
							"named_scores": {
								"Recall": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.8,
							"reason": "F1 is 0.8",
							"named_scores": {
								"F1": 0.8
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						}
					],
					"assertion": null
				}
			},
			{
				"provider": {
					"id": "python:provider_retrieval.py:retrieve_level_two"
				},
				"prompt": {
					"raw": "How can you calculate the similarity between two Voyage embedding vectors, and what is this equivalent to since Voyage embeddings are normalized to length 1?",
					"label": "{{ query }}"
				},
				"vars": {
					"query": "How can you calculate the similarity between two Voyage embedding vectors, and what is this equivalent to since Voyage embeddings are normalized to length 1?",
					"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#faq\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example\"]"
				},
				"response": {
					"output": [
						"https://docs.claude.com/en/docs/build-with-claude/embeddings#faq",
						"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example",
						"https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models"
					]
				},
				"success": true,
				"score": 0.8,
				"namedScores": {},
				"latencyMs": 1290,
				"gradingResult": {
					"pass": true,
					"score": 0.8,
					"reason": "All assertions passed",
					"namedScores": {},
					"tokensUsed": {
						"total": 0,
						"prompt": 0,
						"completion": 0
					},
					"componentResults": [
						{
							"pass": true,
							"score": 0.8,
							"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
							"componentResults": [
								{
									"pass": true,
									"score": 1,
									"reason": "MRR is 1.0",
									"named_scores": {
										"MRR": 1
									}
								},
								{
									"pass": true,
									"score": 0.6666666666666666,
									"reason": "Precision is 0.6666666666666666",
									"named_scores": {
										"Precision": 0.6666666666666666
									}
								},
								{
									"pass": true,
									"score": 1,
									"reason": "Recall is 1.0",
									"named_scores": {
										"Recall": 1
									}
								},
								{
									"pass": true,
									"score": 0.8,
									"reason": "F1 is 0.8",
									"named_scores": {
										"F1": 0.8
									}
								}
							],
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "MRR is 1.0",
							"named_scores": {
								"MRR": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.6666666666666666,
							"reason": "Precision is 0.6666666666666666",
							"named_scores": {
								"Precision": 0.6666666666666666
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "Recall is 1.0",
							"named_scores": {
								"Recall": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.8,
							"reason": "F1 is 0.8",
							"named_scores": {
								"F1": 0.8
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						}
					],
					"assertion": null
				}
			},
			{
				"provider": {
					"id": "python:provider_retrieval.py:retrieve_level_three"
				},
				"prompt": {
					"raw": "How can you calculate the similarity between two Voyage embedding vectors, and what is this equivalent to since Voyage embeddings are normalized to length 1?",
					"label": "{{ query }}"
				},
				"vars": {
					"query": "How can you calculate the similarity between two Voyage embedding vectors, and what is this equivalent to since Voyage embeddings are normalized to length 1?",
					"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#faq\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example\"]"
				},
				"response": {
					"output": [
						"https://docs.claude.com/en/docs/build-with-claude/embeddings#faq",
						"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example",
						"https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models"
					]
				},
				"success": true,
				"score": 0.8,
				"namedScores": {},
				"latencyMs": 0,
				"gradingResult": {
					"pass": true,
					"score": 0.8,
					"reason": "All assertions passed",
					"namedScores": {},
					"tokensUsed": {
						"total": 0,
						"prompt": 0,
						"completion": 0
					},
					"componentResults": [
						{
							"pass": true,
							"score": 0.8,
							"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
							"componentResults": [
								{
									"pass": true,
									"score": 1,
									"reason": "MRR is 1.0",
									"named_scores": {
										"MRR": 1
									}
								},
								{
									"pass": true,
									"score": 0.6666666666666666,
									"reason": "Precision is 0.6666666666666666",
									"named_scores": {
										"Precision": 0.6666666666666666
									}
								},
								{
									"pass": true,
									"score": 1,
									"reason": "Recall is 1.0",
									"named_scores": {
										"Recall": 1
									}
								},
								{
									"pass": true,
									"score": 0.8,
									"reason": "F1 is 0.8",
									"named_scores": {
										"F1": 0.8
									}
								}
							],
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "MRR is 1.0",
							"named_scores": {
								"MRR": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.6666666666666666,
							"reason": "Precision is 0.6666666666666666",
							"named_scores": {
								"Precision": 0.6666666666666666
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "Recall is 1.0",
							"named_scores": {
								"Recall": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.8,
							"reason": "F1 is 0.8",
							"named_scores": {
								"F1": 0.8
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						}
					],
					"assertion": null
				}
			},
			{
				"provider": {
					"id": "python:provider_retrieval.py:retrieve_level_three"
				},
				"prompt": {
					"raw": "As of June 2024, in which regions are Anthropic's Claude.ai API and iOS app available?",
					"label": "{{ query }}"
				},
				"vars": {
					"query": "As of June 2024, in which regions are Anthropic's Claude.ai API and iOS app available?",
					"correct_chunks": "[\"https://docs.claude.com/en/release-notes/claude-apps#may-1st-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024\"]"
				},
				"response": {
					"output": [
						"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024",
						"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024",
						"https://docs.claude.com/en/release-notes/claude-apps#may-1st-2024"
					]
				},
				"success": true,
				"score": 1,
				"namedScores": {},
				"latencyMs": 8043,
				"gradingResult": {
					"pass": true,
					"score": 1,
					"reason": "All assertions passed",
					"namedScores": {},
					"tokensUsed": {
						"total": 0,
						"prompt": 0,
						"completion": 0
					},
					"componentResults": [
						{
							"pass": true,
							"score": 1,
							"reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0",
							"componentResults": [
								{
									"pass": true,
									"score": 1,
									"reason": "MRR is 1.0",
									"named_scores": {
										"MRR": 1
									}
								},
								{
									"pass": true,
									"score": 1,
									"reason": "Precision is 1.0",
									"named_scores": {
										"Precision": 1
									}
								},
								{
									"pass": true,
									"score": 1,
									"reason": "Recall is 1.0",
									"named_scores": {
										"Recall": 1
									}
								},
								{
									"pass": true,
									"score": 1,
									"reason": "F1 is 1.0",
									"named_scores": {
										"F1": 1
									}
								}
							],
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "MRR is 1.0",
							"named_scores": {
								"MRR": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "Precision is 1.0",
							"named_scores": {
								"Precision": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "Recall is 1.0",
							"named_scores": {
								"Recall": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "F1 is 1.0",
							"named_scores": {
								"F1": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						}
					],
					"assertion": null
				}
			},
			{
				"provider": {
					"id": "python:provider_retrieval.py:retrieve_level_two"
				},
				"prompt": {
					"raw": "How can using examples in prompts improve Claude's performance on complex tasks?",
					"label": "{{ query }}"
				},
				"vars": {
					"query": "How can using examples in prompts improve Claude's performance on complex tasks?",
					"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#chain-prompts-for-complex-tasks\"]"
				},
				"response": {
					"output": [
						"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples",
						"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#iterating-your-prompt-for-better-performance",
						"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer"
					]
				},
				"success": true,
				"score": 0.4,
				"namedScores": {},
				"latencyMs": 1395,
				"gradingResult": {
					"pass": true,
					"score": 0.4,
					"reason": "All assertions passed",
					"namedScores": {},
					"tokensUsed": {
						"total": 0,
						"prompt": 0,
						"completion": 0
					},
					"componentResults": [
						{
							"pass": true,
							"score": 0.4,
							"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
							"componentResults": [
								{
									"pass": true,
									"score": 1,
									"reason": "MRR is 1.0",
									"named_scores": {
										"MRR": 1
									}
								},
								{
									"pass": true,
									"score": 0.3333333333333333,
									"reason": "Precision is 0.3333333333333333",
									"named_scores": {
										"Precision": 0.3333333333333333
									}
								},
								{
									"pass": true,
									"score": 0.5,
									"reason": "Recall is 0.5",
									"named_scores": {
										"Recall": 0.5
									}
								},
								{
									"pass": true,
									"score": 0.4,
									"reason": "F1 is 0.4",
									"named_scores": {
										"F1": 0.4
									}
								}
							],
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "MRR is 1.0",
							"named_scores": {
								"MRR": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.3333333333333333,
							"reason": "Precision is 0.3333333333333333",
							"named_scores": {
								"Precision": 0.3333333333333333
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.5,
							"reason": "Recall is 0.5",
							"named_scores": {
								"Recall": 0.5
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.4,
							"reason": "F1 is 0.4",
							"named_scores": {
								"F1": 0.4
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						}
					],
					"assertion": null
				}
			},
			{
				"provider": {
					"id": "python:provider_retrieval.py:retrieve_base"
				},
				"prompt": {
					"raw": "How can using examples in prompts improve Claude's performance on complex tasks?",
					"label": "{{ query }}"
				},
				"vars": {
					"query": "How can using examples in prompts improve Claude's performance on complex tasks?",
					"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#chain-prompts-for-complex-tasks\"]"
				},
				"response": {
					"output": [
						"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples",
						"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#iterating-your-prompt-for-better-performance",
						"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer"
					]
				},
				"success": true,
				"score": 0.4,
				"namedScores": {},
				"latencyMs": 1433,
				"gradingResult": {
					"pass": true,
					"score": 0.4,
					"reason": "All assertions passed",
					"namedScores": {},
					"tokensUsed": {
						"total": 0,
						"prompt": 0,
						"completion": 0
					},
					"componentResults": [
						{
							"pass": true,
							"score": 0.4,
							"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
							"componentResults": [
								{
									"pass": true,
									"score": 1,
									"reason": "MRR is 1.0",
									"named_scores": {
										"MRR": 1
									}
								},
								{
									"pass": true,
									"score": 0.3333333333333333,
									"reason": "Precision is 0.3333333333333333",
									"named_scores": {
										"Precision": 0.3333333333333333
									}
								},
								{
									"pass": true,
									"score": 0.5,
									"reason": "Recall is 0.5",
									"named_scores": {
										"Recall": 0.5
									}
								},
								{
									"pass": true,
									"score": 0.4,
									"reason": "F1 is 0.4",
									"named_scores": {
										"F1": 0.4
									}
								}
							],
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "MRR is 1.0",
							"named_scores": {
								"MRR": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.3333333333333333,
							"reason": "Precision is 0.3333333333333333",
							"named_scores": {
								"Precision": 0.3333333333333333
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.5,
							"reason": "Recall is 0.5",
							"named_scores": {
								"Recall": 0.5
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.4,
							"reason": "F1 is 0.4",
							"named_scores": {
								"F1": 0.4
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						}
					],
					"assertion": null
				}
			},
			{
				"provider": {
					"id": "python:provider_retrieval.py:retrieve_base"
				},
				"prompt": {
					"raw": "What are the two types of content block deltas that can be emitted when streaming responses with tool use, and what does each delta type contain?",
					"label": "{{ query }}"
				},
				"vars": {
					"query": "What are the two types of content block deltas that can be emitted when streaming responses with tool use, and what does each delta type contain?",
					"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#streaming-request-with-tool-use\",\"https://docs.claude.com/en/api/messages-streaming#delta-types\"]"
				},
				"response": {
					"output": [
						"https://docs.claude.com/en/api/messages-streaming#delta-types",
						"https://docs.claude.com/en/api/messages-streaming#input-json-delta",
						"https://docs.claude.com/en/api/messages-streaming#text-delta"
					]
				},
				"success": true,
				"score": 0.8571428571428571,
				"namedScores": {},
				"latencyMs": 1231,
				"gradingResult": {
					"pass": true,
					"score": 0.8571428571428571,
					"reason": "All assertions passed",
					"namedScores": {},
					"tokensUsed": {
						"total": 0,
						"prompt": 0,
						"completion": 0
					},
					"componentResults": [
						{
							"pass": true,
							"score": 0.8571428571428571,
							"reason": "Precision: 1.0 \n Recall: 0.75 \n F1 Score: 0.8571428571428571 \n MRR: 1.0",
							"componentResults": [
								{
									"pass": true,
									"score": 1,
									"reason": "MRR is 1.0",
									"named_scores": {
										"MRR": 1
									}
								},
								{
									"pass": true,
									"score": 1,
									"reason": "Precision is 1.0",
									"named_scores": {
										"Precision": 1
									}
								},
								{
									"pass": true,
									"score": 0.75,
									"reason": "Recall is 0.75",
									"named_scores": {
										"Recall": 0.75
									}
								},
								{
									"pass": true,
									"score": 0.8571428571428571,
									"reason": "F1 is 0.8571428571428571",
									"named_scores": {
										"F1": 0.8571428571428571
									}
								}
							],
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "MRR is 1.0",
							"named_scores": {
								"MRR": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "Precision is 1.0",
							"named_scores": {
								"Precision": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.75,
							"reason": "Recall is 0.75",
							"named_scores": {
								"Recall": 0.75
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.8571428571428571,
							"reason": "F1 is 0.8571428571428571",
							"named_scores": {
								"F1": 0.8571428571428571
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						}
					],
					"assertion": null
				}
			},
			{
				"provider": {
					"id": "python:provider_retrieval.py:retrieve_level_two"
				},
				"prompt": {
					"raw": "What are the two types of content block deltas that can be emitted when streaming responses with tool use, and what does each delta type contain?",
					"label": "{{ query }}"
				},
				"vars": {
					"query": "What are the two types of content block deltas that can be emitted when streaming responses with tool use, and what does each delta type contain?",
					"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#streaming-request-with-tool-use\",\"https://docs.claude.com/en/api/messages-streaming#delta-types\"]"
				},
				"response": {
					"output": [
						"https://docs.claude.com/en/api/messages-streaming#delta-types",
						"https://docs.claude.com/en/api/messages-streaming#input-json-delta",
						"https://docs.claude.com/en/api/messages-streaming#text-delta"
					]
				},
				"success": true,
				"score": 0.8571428571428571,
				"namedScores": {},
				"latencyMs": 1250,
				"gradingResult": {
					"pass": true,
					"score": 0.8571428571428571,
					"reason": "All assertions passed",
					"namedScores": {},
					"tokensUsed": {
						"total": 0,
						"prompt": 0,
						"completion": 0
					},
					"componentResults": [
						{
							"pass": true,
							"score": 0.8571428571428571,
							"reason": "Precision: 1.0 \n Recall: 0.75 \n F1 Score: 0.8571428571428571 \n MRR: 1.0",
							"componentResults": [
								{
									"pass": true,
									"score": 1,
									"reason": "MRR is 1.0",
									"named_scores": {
										"MRR": 1
									}
								},
								{
									"pass": true,
									"score": 1,
									"reason": "Precision is 1.0",
									"named_scores": {
										"Precision": 1
									}
								},
								{
									"pass": true,
									"score": 0.75,
									"reason": "Recall is 0.75",
									"named_scores": {
										"Recall": 0.75
									}
								},
								{
									"pass": true,
									"score": 0.8571428571428571,
									"reason": "F1 is 0.8571428571428571",
									"named_scores": {
										"F1": 0.8571428571428571
									}
								}
							],
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "MRR is 1.0",
							"named_scores": {
								"MRR": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "Precision is 1.0",
							"named_scores": {
								"Precision": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.75,
							"reason": "Recall is 0.75",
							"named_scores": {
								"Recall": 0.75
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.8571428571428571,
							"reason": "F1 is 0.8571428571428571",
							"named_scores": {
								"F1": 0.8571428571428571
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						}
					],
					"assertion": null
				}
			},
			{
				"provider": {
					"id": "python:provider_retrieval.py:retrieve_level_three"
				},
				"prompt": {
					"raw": "What are the two main approaches for integrating Claude into a support ticket workflow, and how do they differ in terms of scalability and ease of implementation?",
					"label": "{{ query }}"
				},
				"vars": {
					"query": "What are the two main approaches for integrating Claude into a support ticket workflow, and how do they differ in terms of scalability and ease of implementation?",
					"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#introduction\"]"
				},
				"response": {
					"output": [
						"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow",
						"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations",
						"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#adapting-to-common-scenarios"
					]
				},
				"success": true,
				"score": 0.4,
				"namedScores": {},
				"latencyMs": 8149,
				"gradingResult": {
					"pass": true,
					"score": 0.4,
					"reason": "All assertions passed",
					"namedScores": {},
					"tokensUsed": {
						"total": 0,
						"prompt": 0,
						"completion": 0
					},
					"componentResults": [
						{
							"pass": true,
							"score": 0.4,
							"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
							"componentResults": [
								{
									"pass": true,
									"score": 1,
									"reason": "MRR is 1.0",
									"named_scores": {
										"MRR": 1
									}
								},
								{
									"pass": true,
									"score": 0.3333333333333333,
									"reason": "Precision is 0.3333333333333333",
									"named_scores": {
										"Precision": 0.3333333333333333
									}
								},
								{
									"pass": true,
									"score": 0.5,
									"reason": "Recall is 0.5",
									"named_scores": {
										"Recall": 0.5
									}
								},
								{
									"pass": true,
									"score": 0.4,
									"reason": "F1 is 0.4",
									"named_scores": {
										"F1": 0.4
									}
								}
							],
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "MRR is 1.0",
							"named_scores": {
								"MRR": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.3333333333333333,
							"reason": "Precision is 0.3333333333333333",
							"named_scores": {
								"Precision": 0.3333333333333333
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.5,
							"reason": "Recall is 0.5",
							"named_scores": {
								"Recall": 0.5
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.4,
							"reason": "F1 is 0.4",
							"named_scores": {
								"F1": 0.4
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						}
					],
					"assertion": null
				}
			},
			{
				"provider": {
					"id": "python:provider_retrieval.py:retrieve_level_three"
				},
				"prompt": {
					"raw": "What are the two types of content block deltas that can be emitted when streaming responses with tool use, and what does each delta type contain?",
					"label": "{{ query }}"
				},
				"vars": {
					"query": "What are the two types of content block deltas that can be emitted when streaming responses with tool use, and what does each delta type contain?",
					"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#streaming-request-with-tool-use\",\"https://docs.claude.com/en/api/messages-streaming#delta-types\"]"
				},
				"response": {
					"output": [
						"https://docs.claude.com/en/api/messages-streaming#delta-types",
						"https://docs.claude.com/en/api/messages-streaming#input-json-delta",
						"https://docs.claude.com/en/api/messages-streaming#text-delta"
					]
				},
				"success": true,
				"score": 0.8571428571428571,
				"namedScores": {},
				"latencyMs": 1,
				"gradingResult": {
					"pass": true,
					"score": 0.8571428571428571,
					"reason": "All assertions passed",
					"namedScores": {},
					"tokensUsed": {
						"total": 0,
						"prompt": 0,
						"completion": 0
					},
					"componentResults": [
						{
							"pass": true,
							"score": 0.8571428571428571,
							"reason": "Precision: 1.0 \n Recall: 0.75 \n F1 Score: 0.8571428571428571 \n MRR: 1.0",
							"componentResults": [
								{
									"pass": true,
									"score": 1,
									"reason": "MRR is 1.0",
									"named_scores": {
										"MRR": 1
									}
								},
								{
									"pass": true,
									"score": 1,
									"reason": "Precision is 1.0",
									"named_scores": {
										"Precision": 1
									}
								},
								{
									"pass": true,
									"score": 0.75,
									"reason": "Recall is 0.75",
									"named_scores": {
										"Recall": 0.75
									}
								},
								{
									"pass": true,
									"score": 0.8571428571428571,
									"reason": "F1 is 0.8571428571428571",
									"named_scores": {
										"F1": 0.8571428571428571
									}
								}
							],
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "MRR is 1.0",
							"named_scores": {
								"MRR": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "Precision is 1.0",
							"named_scores": {
								"Precision": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.75,
							"reason": "Recall is 0.75",
							"named_scores": {
								"Recall": 0.75
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.8571428571428571,
							"reason": "F1 is 0.8571428571428571",
							"named_scores": {
								"F1": 0.8571428571428571
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						}
					],
					"assertion": null
				}
			},
			{
				"provider": {
					"id": "python:provider_retrieval.py:retrieve_base"
				},
				"prompt": {
					"raw": "What are two key capabilities of Claude that enable it to build interactive systems and personalized user experiences?",
					"label": "{{ query }}"
				},
				"vars": {
					"query": "What are two key capabilities of Claude that enable it to build interactive systems and personalized user experiences?",
					"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/text-generation#text-capabilities-and-use-cases\"]"
				},
				"response": {
					"output": [
						"https://docs.claude.com/en/docs/intro-to-claude#what-you-can-do-with-claude",
						"https://docs.claude.com/en/docs/build-with-claude/text-generation#text-capabilities-and-use-cases",
						"https://docs.claude.com/en/docs/intro-to-claude#enterprise-considerations"
					]
				},
				"success": true,
				"score": 0.5,
				"namedScores": {},
				"latencyMs": 1302,
				"gradingResult": {
					"pass": true,
					"score": 0.5,
					"reason": "All assertions passed",
					"namedScores": {},
					"tokensUsed": {
						"total": 0,
						"prompt": 0,
						"completion": 0
					},
					"componentResults": [
						{
							"pass": true,
							"score": 0.5,
							"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.5",
							"componentResults": [
								{
									"pass": true,
									"score": 0.5,
									"reason": "MRR is 0.5",
									"named_scores": {
										"MRR": 0.5
									}
								},
								{
									"pass": true,
									"score": 0.3333333333333333,
									"reason": "Precision is 0.3333333333333333",
									"named_scores": {
										"Precision": 0.3333333333333333
									}
								},
								{
									"pass": true,
									"score": 1,
									"reason": "Recall is 1.0",
									"named_scores": {
										"Recall": 1
									}
								},
								{
									"pass": true,
									"score": 0.5,
									"reason": "F1 is 0.5",
									"named_scores": {
										"F1": 0.5
									}
								}
							],
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.5,
							"reason": "MRR is 0.5",
							"named_scores": {
								"MRR": 0.5
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.3333333333333333,
							"reason": "Precision is 0.3333333333333333",
							"named_scores": {
								"Precision": 0.3333333333333333
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "Recall is 1.0",
							"named_scores": {
								"Recall": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.5,
							"reason": "F1 is 0.5",
							"named_scores": {
								"F1": 0.5
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						}
					],
					"assertion": null
				}
			},
			{
				"provider": {
					"id": "python:provider_retrieval.py:retrieve_level_three"
				},
				"prompt": {
					"raw": "How can using examples in prompts improve Claude's performance on complex tasks?",
					"label": "{{ query }}"
				},
				"vars": {
					"query": "How can using examples in prompts improve Claude's performance on complex tasks?",
					"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#chain-prompts-for-complex-tasks\"]"
				},
				"response": {
					"output": [
						"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#iterating-your-prompt-for-better-performance",
						"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer",
						"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#adapting-to-common-scenarios"
					]
				},
				"success": false,
				"score": 0,
				"namedScores": {},
				"latencyMs": 3354,
				"error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
				"gradingResult": {
					"pass": false,
					"score": 0,
					"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
					"namedScores": {},
					"tokensUsed": {
						"total": 0,
						"prompt": 0,
						"completion": 0
					},
					"componentResults": [
						{
							"pass": false,
							"score": 0,
							"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
							"componentResults": [
								{
									"pass": true,
									"score": 0,
									"reason": "MRR is 0",
									"named_scores": {
										"MRR": 0
									}
								},
								{
									"pass": true,
									"score": 0,
									"reason": "Precision is 0.0",
									"named_scores": {
										"Precision": 0
									}
								},
								{
									"pass": true,
									"score": 0,
									"reason": "Recall is 0.0",
									"named_scores": {
										"Recall": 0
									}
								},
								{
									"pass": true,
									"score": 0,
									"reason": "F1 is 0",
									"named_scores": {
										"F1": 0
									}
								}
							],
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0,
							"reason": "MRR is 0",
							"named_scores": {
								"MRR": 0
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0,
							"reason": "Precision is 0.0",
							"named_scores": {
								"Precision": 0
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0,
							"reason": "Recall is 0.0",
							"named_scores": {
								"Recall": 0
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0,
							"reason": "F1 is 0",
							"named_scores": {
								"F1": 0
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						}
					],
					"assertion": null
				}
			},
			{
				"provider": {
					"id": "python:provider_retrieval.py:retrieve_level_two"
				},
				"prompt": {
					"raw": "What are two key capabilities of Claude that enable it to build interactive systems and personalized user experiences?",
					"label": "{{ query }}"
				},
				"vars": {
					"query": "What are two key capabilities of Claude that enable it to build interactive systems and personalized user experiences?",
					"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/text-generation#text-capabilities-and-use-cases\"]"
				},
				"response": {
					"output": [
						"https://docs.claude.com/en/docs/intro-to-claude#what-you-can-do-with-claude",
						"https://docs.claude.com/en/docs/build-with-claude/text-generation#text-capabilities-and-use-cases",
						"https://docs.claude.com/en/docs/intro-to-claude#implementing-claude"
					]
				},
				"success": true,
				"score": 0.5,
				"namedScores": {},
				"latencyMs": 1387,
				"gradingResult": {
					"pass": true,
					"score": 0.5,
					"reason": "All assertions passed",
					"namedScores": {},
					"tokensUsed": {
						"total": 0,
						"prompt": 0,
						"completion": 0
					},
					"componentResults": [
						{
							"pass": true,
							"score": 0.5,
							"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.5",
							"componentResults": [
								{
									"pass": true,
									"score": 0.5,
									"reason": "MRR is 0.5",
									"named_scores": {
										"MRR": 0.5
									}
								},
								{
									"pass": true,
									"score": 0.3333333333333333,
									"reason": "Precision is 0.3333333333333333",
									"named_scores": {
										"Precision": 0.3333333333333333
									}
								},
								{
									"pass": true,
									"score": 1,
									"reason": "Recall is 1.0",
									"named_scores": {
										"Recall": 1
									}
								},
								{
									"pass": true,
									"score": 0.5,
									"reason": "F1 is 0.5",
									"named_scores": {
										"F1": 0.5
									}
								}
							],
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.5,
							"reason": "MRR is 0.5",
							"named_scores": {
								"MRR": 0.5
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.3333333333333333,
							"reason": "Precision is 0.3333333333333333",
							"named_scores": {
								"Precision": 0.3333333333333333
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "Recall is 1.0",
							"named_scores": {
								"Recall": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.5,
							"reason": "F1 is 0.5",
							"named_scores": {
								"F1": 0.5
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						}
					],
					"assertion": null
				}
			},
			{
				"provider": {
					"id": "python:provider_retrieval.py:retrieve_level_two"
				},
				"prompt": {
					"raw": "What are the key event types included in a raw HTTP stream response when using message streaming, and what is the typical order they occur in?",
					"label": "{{ query }}"
				},
				"vars": {
					"query": "What are the key event types included in a raw HTTP stream response when using message streaming, and what is the typical order they occur in?",
					"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#event-types\",\"https://docs.claude.com/en/api/messages-streaming#raw-http-stream-response\"]"
				},
				"response": {
					"output": [
						"https://docs.claude.com/en/api/messages-streaming#raw-http-stream-response",
						"https://docs.claude.com/en/api/messages-streaming#event-types",
						"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format"
					]
				},
				"success": true,
				"score": 0.8,
				"namedScores": {},
				"latencyMs": 1282,
				"gradingResult": {
					"pass": true,
					"score": 0.8,
					"reason": "All assertions passed",
					"namedScores": {},
					"tokensUsed": {
						"total": 0,
						"prompt": 0,
						"completion": 0
					},
					"componentResults": [
						{
							"pass": true,
							"score": 0.8,
							"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
							"componentResults": [
								{
									"pass": true,
									"score": 1,
									"reason": "MRR is 1.0",
									"named_scores": {
										"MRR": 1
									}
								},
								{
									"pass": true,
									"score": 0.6666666666666666,
									"reason": "Precision is 0.6666666666666666",
									"named_scores": {
										"Precision": 0.6666666666666666
									}
								},
								{
									"pass": true,
									"score": 1,
									"reason": "Recall is 1.0",
									"named_scores": {
										"Recall": 1
									}
								},
								{
									"pass": true,
									"score": 0.8,
									"reason": "F1 is 0.8",
									"named_scores": {
										"F1": 0.8
									}
								}
							],
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "MRR is 1.0",
							"named_scores": {
								"MRR": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.6666666666666666,
							"reason": "Precision is 0.6666666666666666",
							"named_scores": {
								"Precision": 0.6666666666666666
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "Recall is 1.0",
							"named_scores": {
								"Recall": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.8,
							"reason": "F1 is 0.8",
							"named_scores": {
								"F1": 0.8
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						}
					],
					"assertion": null
				}
			},
			{
				"provider": {
					"id": "python:provider_retrieval.py:retrieve_base"
				},
				"prompt": {
					"raw": "What are the key event types included in a raw HTTP stream response when using message streaming, and what is the typical order they occur in?",
					"label": "{{ query }}"
				},
				"vars": {
					"query": "What are the key event types included in a raw HTTP stream response when using message streaming, and what is the typical order they occur in?",
					"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#event-types\",\"https://docs.claude.com/en/api/messages-streaming#raw-http-stream-response\"]"
				},
				"response": {
					"output": [
						"https://docs.claude.com/en/api/messages-streaming#raw-http-stream-response",
						"https://docs.claude.com/en/api/messages-streaming#event-types",
						"https://docs.claude.com/en/api/messages-streaming#basic-streaming-request"
					]
				},
				"success": true,
				"score": 0.8,
				"namedScores": {},
				"latencyMs": 1475,
				"gradingResult": {
					"pass": true,
					"score": 0.8,
					"reason": "All assertions passed",
					"namedScores": {},
					"tokensUsed": {
						"total": 0,
						"prompt": 0,
						"completion": 0
					},
					"componentResults": [
						{
							"pass": true,
							"score": 0.8,
							"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
							"componentResults": [
								{
									"pass": true,
									"score": 1,
									"reason": "MRR is 1.0",
									"named_scores": {
										"MRR": 1
									}
								},
								{
									"pass": true,
									"score": 0.6666666666666666,
									"reason": "Precision is 0.6666666666666666",
									"named_scores": {
										"Precision": 0.6666666666666666
									}
								},
								{
									"pass": true,
									"score": 1,
									"reason": "Recall is 1.0",
									"named_scores": {
										"Recall": 1
									}
								},
								{
									"pass": true,
									"score": 0.8,
									"reason": "F1 is 0.8",
									"named_scores": {
										"F1": 0.8
									}
								}
							],
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "MRR is 1.0",
							"named_scores": {
								"MRR": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.6666666666666666,
							"reason": "Precision is 0.6666666666666666",
							"named_scores": {
								"Precision": 0.6666666666666666
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "Recall is 1.0",
							"named_scores": {
								"Recall": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.8,
							"reason": "F1 is 0.8",
							"named_scores": {
								"F1": 0.8
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						}
					],
					"assertion": null
				}
			},
			{
				"provider": {
					"id": "python:provider_retrieval.py:retrieve_level_three"
				},
				"prompt": {
					"raw": "What are two key capabilities of Claude that enable it to build interactive systems and personalized user experiences?",
					"label": "{{ query }}"
				},
				"vars": {
					"query": "What are two key capabilities of Claude that enable it to build interactive systems and personalized user experiences?",
					"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/text-generation#text-capabilities-and-use-cases\"]"
				},
				"response": {
					"output": [
						"https://docs.claude.com/en/docs/build-with-claude/text-generation#text-capabilities-and-use-cases",
						"https://docs.claude.com/en/docs/welcome#key-capabilities",
						"https://docs.claude.com/en/docs/intro-to-claude#what-you-can-do-with-claude"
					]
				},
				"success": true,
				"score": 0.5,
				"namedScores": {},
				"latencyMs": 3452,
				"gradingResult": {
					"pass": true,
					"score": 0.5,
					"reason": "All assertions passed",
					"namedScores": {},
					"tokensUsed": {
						"total": 0,
						"prompt": 0,
						"completion": 0
					},
					"componentResults": [
						{
							"pass": true,
							"score": 0.5,
							"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
							"componentResults": [
								{
									"pass": true,
									"score": 1,
									"reason": "MRR is 1.0",
									"named_scores": {
										"MRR": 1
									}
								},
								{
									"pass": true,
									"score": 0.3333333333333333,
									"reason": "Precision is 0.3333333333333333",
									"named_scores": {
										"Precision": 0.3333333333333333
									}
								},
								{
									"pass": true,
									"score": 1,
									"reason": "Recall is 1.0",
									"named_scores": {
										"Recall": 1
									}
								},
								{
									"pass": true,
									"score": 0.5,
									"reason": "F1 is 0.5",
									"named_scores": {
										"F1": 0.5
									}
								}
							],
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "MRR is 1.0",
							"named_scores": {
								"MRR": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.3333333333333333,
							"reason": "Precision is 0.3333333333333333",
							"named_scores": {
								"Precision": 0.3333333333333333
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "Recall is 1.0",
							"named_scores": {
								"Recall": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.5,
							"reason": "F1 is 0.5",
							"named_scores": {
								"F1": 0.5
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						}
					],
					"assertion": null
				}
			},
			{
				"provider": {
					"id": "python:provider_retrieval.py:retrieve_base"
				},
				"prompt": {
					"raw": "What is the maximum number of images that can be included in a single request using the Claude API compared to the claude.ai interface?",
					"label": "{{ query }}"
				},
				"vars": {
					"query": "What is the maximum number of images that can be included in a single request using the Claude API compared to the claude.ai interface?",
					"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#about-the-prompt-examples\",\"https://docs.claude.com/en/docs/build-with-claude/vision#faq\"]"
				},
				"response": {
					"output": [
						"https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size",
						"https://docs.claude.com/en/api/messages-examples#vision",
						"https://docs.claude.com/en/docs/build-with-claude/vision#faq"
					]
				},
				"success": true,
				"score": 0.4,
				"namedScores": {},
				"latencyMs": 1502,
				"gradingResult": {
					"pass": true,
					"score": 0.4,
					"reason": "All assertions passed",
					"namedScores": {},
					"tokensUsed": {
						"total": 0,
						"prompt": 0,
						"completion": 0
					},
					"componentResults": [
						{
							"pass": true,
							"score": 0.4,
							"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
							"componentResults": [
								{
									"pass": true,
									"score": 0.3333333333333333,
									"reason": "MRR is 0.3333333333333333",
									"named_scores": {
										"MRR": 0.3333333333333333
									}
								},
								{
									"pass": true,
									"score": 0.3333333333333333,
									"reason": "Precision is 0.3333333333333333",
									"named_scores": {
										"Precision": 0.3333333333333333
									}
								},
								{
									"pass": true,
									"score": 0.5,
									"reason": "Recall is 0.5",
									"named_scores": {
										"Recall": 0.5
									}
								},
								{
									"pass": true,
									"score": 0.4,
									"reason": "F1 is 0.4",
									"named_scores": {
										"F1": 0.4
									}
								}
							],
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.3333333333333333,
							"reason": "MRR is 0.3333333333333333",
							"named_scores": {
								"MRR": 0.3333333333333333
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.3333333333333333,
							"reason": "Precision is 0.3333333333333333",
							"named_scores": {
								"Precision": 0.3333333333333333
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.5,
							"reason": "Recall is 0.5",
							"named_scores": {
								"Recall": 0.5
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.4,
							"reason": "F1 is 0.4",
							"named_scores": {
								"F1": 0.4
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						}
					],
					"assertion": null
				}
			},
			{
				"provider": {
					"id": "python:provider_retrieval.py:retrieve_level_two"
				},
				"prompt": {
					"raw": "What is the maximum number of images that can be included in a single request using the Claude API compared to the claude.ai interface?",
					"label": "{{ query }}"
				},
				"vars": {
					"query": "What is the maximum number of images that can be included in a single request using the Claude API compared to the claude.ai interface?",
					"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#about-the-prompt-examples\",\"https://docs.claude.com/en/docs/build-with-claude/vision#faq\"]"
				},
				"response": {
					"output": [
						"https://docs.claude.com/en/api/rate-limits#about-our-limits",
						"https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size",
						"https://docs.claude.com/en/api/messages-examples#vision"
					]
				},
				"success": false,
				"score": 0,
				"namedScores": {},
				"latencyMs": 1489,
				"error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
				"gradingResult": {
					"pass": false,
					"score": 0,
					"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
					"namedScores": {},
					"tokensUsed": {
						"total": 0,
						"prompt": 0,
						"completion": 0
					},
					"componentResults": [
						{
							"pass": false,
							"score": 0,
							"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
							"componentResults": [
								{
									"pass": true,
									"score": 0,
									"reason": "MRR is 0",
									"named_scores": {
										"MRR": 0
									}
								},
								{
									"pass": true,
									"score": 0,
									"reason": "Precision is 0.0",
									"named_scores": {
										"Precision": 0
									}
								},
								{
									"pass": true,
									"score": 0,
									"reason": "Recall is 0.0",
									"named_scores": {
										"Recall": 0
									}
								},
								{
									"pass": true,
									"score": 0,
									"reason": "F1 is 0",
									"named_scores": {
										"F1": 0
									}
								}
							],
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0,
							"reason": "MRR is 0",
							"named_scores": {
								"MRR": 0
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0,
							"reason": "Precision is 0.0",
							"named_scores": {
								"Precision": 0
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0,
							"reason": "Recall is 0.0",
							"named_scores": {
								"Recall": 0
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0,
							"reason": "F1 is 0",
							"named_scores": {
								"F1": 0
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						}
					],
					"assertion": null
				}
			},
			{
				"provider": {
					"id": "python:provider_retrieval.py:retrieve_level_three"
				},
				"prompt": {
					"raw": "What are the key event types included in a raw HTTP stream response when using message streaming, and what is the typical order they occur in?",
					"label": "{{ query }}"
				},
				"vars": {
					"query": "What are the key event types included in a raw HTTP stream response when using message streaming, and what is the typical order they occur in?",
					"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#event-types\",\"https://docs.claude.com/en/api/messages-streaming#raw-http-stream-response\"]"
				},
				"response": {
					"output": [
						"https://docs.claude.com/en/api/messages-streaming#raw-http-stream-response",
						"https://docs.claude.com/en/api/messages-streaming#event-types",
						"https://docs.claude.com/en/api/messages-streaming#basic-streaming-request"
					]
				},
				"success": true,
				"score": 0.8,
				"namedScores": {},
				"latencyMs": 3489,
				"gradingResult": {
					"pass": true,
					"score": 0.8,
					"reason": "All assertions passed",
					"namedScores": {},
					"tokensUsed": {
						"total": 0,
						"prompt": 0,
						"completion": 0
					},
					"componentResults": [
						{
							"pass": true,
							"score": 0.8,
							"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
							"componentResults": [
								{
									"pass": true,
									"score": 1,
									"reason": "MRR is 1.0",
									"named_scores": {
										"MRR": 1
									}
								},
								{
									"pass": true,
									"score": 0.6666666666666666,
									"reason": "Precision is 0.6666666666666666",
									"named_scores": {
										"Precision": 0.6666666666666666
									}
								},
								{
									"pass": true,
									"score": 1,
									"reason": "Recall is 1.0",
									"named_scores": {
										"Recall": 1
									}
								},
								{
									"pass": true,
									"score": 0.8,
									"reason": "F1 is 0.8",
									"named_scores": {
										"F1": 0.8
									}
								}
							],
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "MRR is 1.0",
							"named_scores": {
								"MRR": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.6666666666666666,
							"reason": "Precision is 0.6666666666666666",
							"named_scores": {
								"Precision": 0.6666666666666666
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 1,
							"reason": "Recall is 1.0",
							"named_scores": {
								"Recall": 1
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						},
						{
							"pass": true,
							"score": 0.8,
							"reason": "F1 is 0.8",
							"named_scores": {
								"F1": 0.8
							},
							"assertion": {
								"type": "python",
								"value": "file://eval_retrieval.py"
							}
						}
					],
					"assertion": null
				}
			},
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_base"
        },
        "prompt": {
          "raw": "When Claude's response is cut off due to hitting the max_tokens limit and contains an incomplete tool use block, what should you do to get the full tool use?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "When Claude's response is cut off due to hitting the max_tokens limit and contains an incomplete tool use block, what should you do to get the full tool use?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works",
            "https://docs.claude.com/en/docs/build-with-claude/tool-use#pricing",
            "https://docs.claude.com/en/docs/build-with-claude/tool-use#next-steps"
          ]
        },
        "success": false,
        "score": 0,
        "namedScores": {},
        "latencyMs": 1541,
        "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "gradingResult": {
          "pass": false,
          "score": 0,
          "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": false,
              "score": 0,
              "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 0,
                  "reason": "MRR is 0",
                  "named_scores": {
                    "MRR": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "Precision is 0.0",
                  "named_scores": {
                    "Precision": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "Recall is 0.0",
                  "named_scores": {
                    "Recall": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "F1 is 0",
                  "named_scores": {
                    "F1": 0
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "MRR is 0",
              "named_scores": {
                "MRR": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "Precision is 0.0",
              "named_scores": {
                "Precision": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "Recall is 0.0",
              "named_scores": {
                "Recall": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "F1 is 0",
              "named_scores": {
                "F1": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_two"
        },
        "prompt": {
          "raw": "When Claude's response is cut off due to hitting the max_tokens limit and contains an incomplete tool use block, what should you do to get the full tool use?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "When Claude's response is cut off due to hitting the max_tokens limit and contains an incomplete tool use block, what should you do to get the full tool use?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works",
            "https://docs.claude.com/en/docs/build-with-claude/tool-use#pricing",
            "https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors"
          ]
        },
        "success": true,
        "score": 0.5,
        "namedScores": {},
        "latencyMs": 1512,
        "gradingResult": {
          "pass": true,
          "score": 0.5,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.5,
              "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.3333333333333333",
              "componentResults": [
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "MRR is 0.3333333333333333",
                  "named_scores": {
                    "MRR": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "F1 is 0.5",
                  "named_scores": {
                    "F1": 0.5
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "MRR is 0.3333333333333333",
              "named_scores": {
                "MRR": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "F1 is 0.5",
              "named_scores": {
                "F1": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_base"
        },
        "prompt": {
          "raw": "What two steps are needed before running a classification evaluation on Claude according to the documentation?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What two steps are needed before running a classification evaluation on Claude according to the documentation?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#3-run-your-eval\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#2-develop-your-test-cases\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification",
            "https://docs.claude.com/en/docs/about-claude/use-cases/classification#deploy-your-classifier",
            "https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics"
          ]
        },
        "success": false,
        "score": 0,
        "namedScores": {},
        "latencyMs": 1259,
        "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "gradingResult": {
          "pass": false,
          "score": 0,
          "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": false,
              "score": 0,
              "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 0,
                  "reason": "MRR is 0",
                  "named_scores": {
                    "MRR": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "Precision is 0.0",
                  "named_scores": {
                    "Precision": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "Recall is 0.0",
                  "named_scores": {
                    "Recall": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "F1 is 0",
                  "named_scores": {
                    "F1": 0
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "MRR is 0",
              "named_scores": {
                "MRR": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "Precision is 0.0",
              "named_scores": {
                "Precision": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "Recall is 0.0",
              "named_scores": {
                "Recall": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "F1 is 0",
              "named_scores": {
                "F1": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_two"
        },
        "prompt": {
          "raw": "What two steps are needed before running a classification evaluation on Claude according to the documentation?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What two steps are needed before running a classification evaluation on Claude according to the documentation?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#3-run-your-eval\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#2-develop-your-test-cases\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification",
            "https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics",
            "https://docs.claude.com/en/docs/about-claude/use-cases/classification#deploy-your-classifier"
          ]
        },
        "success": false,
        "score": 0,
        "namedScores": {},
        "latencyMs": 1278,
        "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "gradingResult": {
          "pass": false,
          "score": 0,
          "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": false,
              "score": 0,
              "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 0,
                  "reason": "MRR is 0",
                  "named_scores": {
                    "MRR": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "Precision is 0.0",
                  "named_scores": {
                    "Precision": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "Recall is 0.0",
                  "named_scores": {
                    "Recall": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "F1 is 0",
                  "named_scores": {
                    "F1": 0
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "MRR is 0",
              "named_scores": {
                "MRR": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "Precision is 0.0",
              "named_scores": {
                "Precision": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "Recall is 0.0",
              "named_scores": {
                "Recall": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "F1 is 0",
              "named_scores": {
                "F1": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_three"
        },
        "prompt": {
          "raw": "What is the maximum number of images that can be included in a single request using the Claude API compared to the claude.ai interface?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What is the maximum number of images that can be included in a single request using the Claude API compared to the claude.ai interface?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#about-the-prompt-examples\",\"https://docs.claude.com/en/docs/build-with-claude/vision#faq\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size",
            "https://docs.claude.com/en/docs/build-with-claude/vision#faq",
            "https://docs.claude.com/en/api/messages-examples#vision"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 3939,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
              "componentResults": [
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "MRR is 0.5",
                  "named_scores": {
                    "MRR": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "MRR is 0.5",
              "named_scores": {
                "MRR": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_three"
        },
        "prompt": {
          "raw": "What two steps are needed before running a classification evaluation on Claude according to the documentation?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What two steps are needed before running a classification evaluation on Claude according to the documentation?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#3-run-your-eval\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#2-develop-your-test-cases\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification",
            "https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics",
            "https://docs.claude.com/en/docs/about-claude/use-cases/classification#deploy-your-classifier"
          ]
        },
        "success": false,
        "score": 0,
        "namedScores": {},
        "latencyMs": 1,
        "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "gradingResult": {
          "pass": false,
          "score": 0,
          "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": false,
              "score": 0,
              "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 0,
                  "reason": "MRR is 0",
                  "named_scores": {
                    "MRR": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "Precision is 0.0",
                  "named_scores": {
                    "Precision": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "Recall is 0.0",
                  "named_scores": {
                    "Recall": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "F1 is 0",
                  "named_scores": {
                    "F1": 0
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "MRR is 0",
              "named_scores": {
                "MRR": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "Precision is 0.0",
              "named_scores": {
                "Precision": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "Recall is 0.0",
              "named_scores": {
                "Recall": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "F1 is 0",
              "named_scores": {
                "F1": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_two"
        },
        "prompt": {
          "raw": "How can you use the content parameter in the messages list to influence Claude's response?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How can you use the content parameter in the messages list to influence Claude's response?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#basic-request-and-response\",\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#advanced-use",
            "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth",
            "https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1437,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
              "componentResults": [
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "MRR is 0.3333333333333333",
                  "named_scores": {
                    "MRR": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "MRR is 0.3333333333333333",
              "named_scores": {
                "MRR": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_base"
        },
        "prompt": {
          "raw": "How can you use the content parameter in the messages list to influence Claude's response?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How can you use the content parameter in the messages list to influence Claude's response?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#basic-request-and-response\",\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/tool-use#controlling-claudes-output",
            "https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#advanced-use",
            "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth"
          ]
        },
        "success": false,
        "score": 0,
        "namedScores": {},
        "latencyMs": 1462,
        "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "gradingResult": {
          "pass": false,
          "score": 0,
          "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": false,
              "score": 0,
              "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 0,
                  "reason": "MRR is 0",
                  "named_scores": {
                    "MRR": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "Precision is 0.0",
                  "named_scores": {
                    "Precision": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "Recall is 0.0",
                  "named_scores": {
                    "Recall": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "F1 is 0",
                  "named_scores": {
                    "F1": 0
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "MRR is 0",
              "named_scores": {
                "MRR": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "Precision is 0.0",
              "named_scores": {
                "Precision": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "Recall is 0.0",
              "named_scores": {
                "Recall": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "F1 is 0",
              "named_scores": {
                "F1": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_two"
        },
        "prompt": {
          "raw": "What are two key advantages of prompt engineering over fine-tuning when it comes to model comprehension and general knowledge preservation?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What are two key advantages of prompt engineering over fine-tuning when it comes to model comprehension and general knowledge preservation?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer",
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial",
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1229,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_base"
        },
        "prompt": {
          "raw": "What are two key advantages of prompt engineering over fine-tuning when it comes to model comprehension and general knowledge preservation?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What are two key advantages of prompt engineering over fine-tuning when it comes to model comprehension and general knowledge preservation?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer",
            "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak"
          ]
        },
        "success": true,
        "score": 0.5,
        "namedScores": {},
        "latencyMs": 1222,
        "gradingResult": {
          "pass": true,
          "score": 0.5,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.5,
              "reason": "Precision: 0.5 \n Recall: 0.5 \n F1 Score: 0.5 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Precision is 0.5",
                  "named_scores": {
                    "Precision": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "F1 is 0.5",
                  "named_scores": {
                    "F1": 0.5
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Precision is 0.5",
              "named_scores": {
                "Precision": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "F1 is 0.5",
              "named_scores": {
                "F1": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_three"
        },
        "prompt": {
          "raw": "What are two key advantages of prompt engineering over fine-tuning when it comes to model comprehension and general knowledge preservation?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What are two key advantages of prompt engineering over fine-tuning when it comes to model comprehension and general knowledge preservation?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer",
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial",
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 0,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_three"
        },
        "prompt": {
          "raw": "How can you use the content parameter in the messages list to influence Claude's response?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How can you use the content parameter in the messages list to influence Claude's response?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#basic-request-and-response\",\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth",
            "https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth",
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#how-to-prefill-claudes-response"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 3727,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
              "componentResults": [
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "MRR is 0.5",
                  "named_scores": {
                    "MRR": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "MRR is 0.5",
              "named_scores": {
                "MRR": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_base"
        },
        "prompt": {
          "raw": "What are the two main steps to get started with making requests to Claude models on Anthropic's Bedrock API?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What are the two main steps to get started with making requests to Claude models on Anthropic's Bedrock API?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#install-and-configure-the-aws-cli\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/claude-on-amazon-bedrock#api-model-names",
            "https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests",
            "https://docs.claude.com/en/api/claude-on-vertex-ai#model-availability"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1395,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
              "componentResults": [
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "MRR is 0.5",
                  "named_scores": {
                    "MRR": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "MRR is 0.5",
              "named_scores": {
                "MRR": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_two"
        },
        "prompt": {
          "raw": "What are the two main steps to get started with making requests to Claude models on Anthropic's Bedrock API?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What are the two main steps to get started with making requests to Claude models on Anthropic's Bedrock API?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#install-and-configure-the-aws-cli\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/claude-on-amazon-bedrock#accessing-bedrock",
            "https://docs.claude.com/en/docs/welcome#get-started",
            "https://docs.claude.com/en/docs/quickstart#prerequisites"
          ]
        },
        "success": false,
        "score": 0,
        "namedScores": {},
        "latencyMs": 1426,
        "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "gradingResult": {
          "pass": false,
          "score": 0,
          "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": false,
              "score": 0,
              "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 0,
                  "reason": "MRR is 0",
                  "named_scores": {
                    "MRR": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "Precision is 0.0",
                  "named_scores": {
                    "Precision": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "Recall is 0.0",
                  "named_scores": {
                    "Recall": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "F1 is 0",
                  "named_scores": {
                    "F1": 0
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "MRR is 0",
              "named_scores": {
                "MRR": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "Precision is 0.0",
              "named_scores": {
                "Precision": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "Recall is 0.0",
              "named_scores": {
                "Recall": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "F1 is 0",
              "named_scores": {
                "F1": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_three"
        },
        "prompt": {
          "raw": "When Claude's response is cut off due to hitting the max_tokens limit and contains an incomplete tool use block, what should you do to get the full tool use?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "When Claude's response is cut off due to hitting the max_tokens limit and contains an incomplete tool use block, what should you do to get the full tool use?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors",
            "https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works",
            "https://docs.claude.com/en/docs/build-with-claude/tool-use#forcing-tool-use"
          ]
        },
        "success": true,
        "score": 0.5,
        "namedScores": {},
        "latencyMs": 7980,
        "gradingResult": {
          "pass": true,
          "score": 0.5,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.5,
              "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "F1 is 0.5",
                  "named_scores": {
                    "F1": 0.5
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "F1 is 0.5",
              "named_scores": {
                "F1": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_base"
        },
        "prompt": {
          "raw": "How can you check which Claude models are available in a specific AWS region using the AWS CLI?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How can you check which Claude models are available in a specific AWS region using the AWS CLI?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#subscribe-to-anthropic-models\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#list-available-models\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/claude-on-amazon-bedrock#list-available-models",
            "https://docs.claude.com/en/api/claude-on-vertex-ai#model-availability",
            "https://docs.claude.com/en/docs/intro-to-claude#model-options"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1334,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_two"
        },
        "prompt": {
          "raw": "How can you check which Claude models are available in a specific AWS region using the AWS CLI?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How can you check which Claude models are available in a specific AWS region using the AWS CLI?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#subscribe-to-anthropic-models\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#list-available-models\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/claude-on-amazon-bedrock#list-available-models",
            "https://docs.claude.com/en/api/claude-on-vertex-ai#model-availability",
            "https://docs.claude.com/en/docs/about-claude/models#model-names"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1312,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_two"
        },
        "prompt": {
          "raw": "What argument can be passed to the voyageai.Client.embed() method or the Voyage HTTP API to specify whether the input text is a query or a document?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What argument can be passed to the voyageai.Client.embed() method or the Voyage HTTP API to specify whether the input text is a query or a document?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-python-package\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api",
            "https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-python-package",
            "https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example"
          ]
        },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1355,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.6666666666666666,
                  "reason": "Precision is 0.6666666666666666",
                  "named_scores": {
                    "Precision": 0.6666666666666666
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.8,
                  "reason": "F1 is 0.8",
                  "named_scores": {
                    "F1": 0.8
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.6666666666666666,
              "reason": "Precision is 0.6666666666666666",
              "named_scores": {
                "Precision": 0.6666666666666666
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.8,
              "reason": "F1 is 0.8",
              "named_scores": {
                "F1": 0.8
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_base"
        },
        "prompt": {
          "raw": "What argument can be passed to the voyageai.Client.embed() method or the Voyage HTTP API to specify whether the input text is a query or a document?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What argument can be passed to the voyageai.Client.embed() method or the Voyage HTTP API to specify whether the input text is a query or a document?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-python-package\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api",
            "https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example",
            "https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-python-package"
          ]
        },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1482,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.6666666666666666,
                  "reason": "Precision is 0.6666666666666666",
                  "named_scores": {
                    "Precision": 0.6666666666666666
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.8,
                  "reason": "F1 is 0.8",
                  "named_scores": {
                    "F1": 0.8
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.6666666666666666,
              "reason": "Precision is 0.6666666666666666",
              "named_scores": {
                "Precision": 0.6666666666666666
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.8,
              "reason": "F1 is 0.8",
              "named_scores": {
                "F1": 0.8
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
        "prompt": { "raw": "What argument can be passed to the voyageai.Client.embed() method or the Voyage HTTP API to specify whether the input text is a query or a document?", "label": "{{ query }}" },
        "vars": { "query": "What argument can be passed to the voyageai.Client.embed() method or the Voyage HTTP API to specify whether the input text is a query or a document?", "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-python-package\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api", "https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example", "https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-python-package"] },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            { "pass": true, "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
        "prompt": { "raw": "What are the two main steps to get started with making requests to Claude models on Anthropic's Bedrock API?", "label": "{{ query }}" },
        "vars": { "query": "What are the two main steps to get started with making requests to Claude models on Anthropic's Bedrock API?", "correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#install-and-configure-the-aws-cli\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests\"]" },
        "response": { "output": ["https://docs.claude.com/en/api/claude-on-amazon-bedrock#subscribe-to-anthropic-models", "https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests", "https://docs.claude.com/en/docs/quickstart#prerequisites"] },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 4005,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            { "pass": true, "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
              "componentResults": [
                { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
        "prompt": { "raw": "How do the streaming API delta formats differ between tool_use content blocks and text content blocks?", "label": "{{ query }}" },
        "vars": { "query": "How do the streaming API delta formats differ between tool_use content blocks and text content blocks?", "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]" },
        "response": { "output": ["https://docs.claude.com/en/api/messages-streaming#input-json-delta", "https://docs.claude.com/en/api/messages-streaming#text-delta", "https://docs.claude.com/en/api/messages-streaming#delta-types"] },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1537,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            { "pass": true, "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
        "prompt": { "raw": "How do the streaming API delta formats differ between tool_use content blocks and text content blocks?", "label": "{{ query }}" },
        "vars": { "query": "How do the streaming API delta formats differ between tool_use content blocks and text content blocks?", "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]" },
        "response": { "output": ["https://docs.claude.com/en/api/messages-streaming#input-json-delta", "https://docs.claude.com/en/api/messages-streaming#text-delta", "https://docs.claude.com/en/api/messages-streaming#delta-types"] },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1577,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            { "pass": true, "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
        "prompt": { "raw": "What are the image file size limits when uploading images to Claude using the API versus on claude.ai?", "label": "{{ query }}" },
        "vars": { "query": "What are the image file size limits when uploading images to Claude using the API versus on claude.ai?", "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#faq\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/vision#faq", "https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size", "https://docs.claude.com/en/docs/build-with-claude/vision#ensuring-image-quality"] },
        "success": true,
        "score": 0.5,
        "namedScores": {},
        "latencyMs": 1222,
        "gradingResult": {
          "pass": true,
          "score": 0.5,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            { "pass": true, "score": 0.5,
              "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
        "prompt": { "raw": "What are the image file size limits when uploading images to Claude using the API versus on claude.ai?", "label": "{{ query }}" },
        "vars": { "query": "What are the image file size limits when uploading images to Claude using the API versus on claude.ai?", "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#faq\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/vision#faq", "https://docs.claude.com/en/api/rate-limits#about-our-limits", "https://docs.claude.com/en/docs/build-with-claude/vision#ensuring-image-quality"] },
        "success": true,
        "score": 0.5,
        "namedScores": {},
        "latencyMs": 1250,
        "gradingResult": {
          "pass": true,
          "score": 0.5,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            { "pass": true, "score": 0.5,
              "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
        "prompt": { "raw": "What are the image file size limits when uploading images to Claude using the API versus on claude.ai?", "label": "{{ query }}" },
        "vars": { "query": "What are the image file size limits when uploading images to Claude using the API versus on claude.ai?", "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#faq\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/vision#faq", "https://docs.claude.com/en/api/rate-limits#about-our-limits", "https://docs.claude.com/en/docs/build-with-claude/vision#ensuring-image-quality"] },
        "success": true,
        "score": 0.5,
        "namedScores": {},
        "latencyMs": 0,
        "gradingResult": {
          "pass": true,
          "score": 0.5,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            { "pass": true, "score": 0.5,
              "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
        "prompt": { "raw": "How do the streaming API delta formats differ between tool_use content blocks and text content blocks?", "label": "{{ query }}" },
        "vars": { "query": "How do the streaming API delta formats differ between tool_use content blocks and text content blocks?", "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]" },
        "response": { "output": ["https://docs.claude.com/en/api/messages-streaming#input-json-delta", "https://docs.claude.com/en/api/messages-streaming#text-delta", "https://docs.claude.com/en/api/messages-streaming#delta-types"] },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 3352,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            { "pass": true, "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
        "prompt": { "raw": "What is one key consideration when selecting a Claude model for an enterprise use case that needs low latency?", "label": "{{ query }}" },
        "vars": { "query": "What is one key consideration when selecting a Claude model for an enterprise use case that needs low latency?", "correct_chunks": "[\"https://docs.claude.com/en/docs/intro-to-claude#model-options\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/intro-to-claude#model-options", "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model", "https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification"] },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1295,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            { "pass": true, "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
        "prompt": { "raw": "What is one key consideration when selecting a Claude model for an enterprise use case that needs low latency?", "label": "{{ query }}" },
        "vars": { "query": "What is one key consideration when selecting a Claude model for an enterprise use case that needs low latency?", "correct_chunks": "[\"https://docs.claude.com/en/docs/intro-to-claude#model-options\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification", "https://docs.claude.com/en/docs/intro-to-claude#model-options", "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model"] },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1368,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            { "pass": true, "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 0.5",
              "componentResults": [
                { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
        "prompt": { "raw": "What embedding model does Anthropic recommend for code retrieval, and how does its performance compare to alternatives according to Voyage AI?", "label": "{{ query }}" },
        "vars": { "query": "What embedding model does Anthropic recommend for code retrieval, and how does its performance compare to alternatives according to Voyage AI?", "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models", "https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic", "https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example"] },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1291,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            { "pass": true, "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
        "prompt": { "raw": "What embedding model does Anthropic recommend for code retrieval, and how does its performance compare to alternatives according to Voyage AI?", "label": "{{ query }}" },
        "vars": { "query": "What embedding model does Anthropic recommend for code retrieval, and how does its performance compare to alternatives according to Voyage AI?", "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models", "https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic", "https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example"] },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1295,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            { "pass": true, "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
        "prompt": { "raw": "What embedding model does Anthropic recommend for code retrieval, and how does its performance compare to alternatives according to Voyage AI?", "label": "{{ query }}" },
        "vars": { "query": "What embedding model does Anthropic recommend for code retrieval, and how does its performance compare to alternatives according to Voyage AI?", "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models", "https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic", "https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example"] },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            { "pass": true, "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
        "prompt": { "raw": "How can you check which Claude models are available in a specific AWS region using the AWS CLI?", "label": "{{ query }}" },
        "vars": { "query": "How can you check which Claude models are available in a specific AWS region using the AWS CLI?", "correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#subscribe-to-anthropic-models\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#list-available-models\"]" },
        "response": { "output": ["https://docs.claude.com/en/api/claude-on-amazon-bedrock#list-available-models", "https://docs.claude.com/en/api/claude-on-amazon-bedrock#api-model-names", "https://docs.claude.com/en/api/claude-on-vertex-ai#model-availability"] },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 8622,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            { "pass": true, "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
        "prompt": { "raw": "What are two ways the Claude Cookbook can help developers learn to use Anthropic's APIs?", "label": "{{ query }}" },
        "vars": { "query": "What are two ways the Claude Cookbook can help developers learn to use Anthropic's APIs?", "correct_chunks": "[\"https://docs.claude.com/en/docs/welcome#develop-with-claude\",\"https://docs.claude.com/en/docs/quickstart#next-steps\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/welcome#develop-with-claude", "https://docs.claude.com/en/docs/quickstart#next-steps", "https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook"] },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1373,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            { "pass": true, "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_two" },
        "prompt": { "raw": "What are two ways the Claude Cookbook can help developers learn to use Anthropic's APIs?", "label": "{{ query }}" },
        "vars": { "query": "What are two ways the Claude Cookbook can help developers learn to use Anthropic's APIs?", "correct_chunks": "[\"https://docs.claude.com/en/docs/welcome#develop-with-claude\",\"https://docs.claude.com/en/docs/quickstart#next-steps\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/quickstart#next-steps", "https://docs.claude.com/en/api/#accessing-the-api", "https://docs.claude.com/en/docs/welcome#develop-with-claude"] },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1494,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            { "pass": true, "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_level_three" },
        "prompt": { "raw": "What is one key consideration when selecting a Claude model for an enterprise use case that needs low latency?", "label": "{{ query }}" },
        "vars": { "query": "What is one key consideration when selecting a Claude model for an enterprise use case that needs low latency?", "correct_chunks": "[\"https://docs.claude.com/en/docs/intro-to-claude#model-options\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model", "https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#choosing-the-right-model", "https://docs.claude.com/en/docs/about-claude/models#model-comparison"] },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 3757,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            { "pass": true, "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
      {
        "provider": { "id": "python:provider_retrieval.py:retrieve_base" },
        "prompt": { "raw": "How does the size of the context window impact a language model's ability to utilize retrieval augmented generation (RAG)?", "label": "{{ query }}" },
        "vars": { "query": "How does the size of the context window impact a language model's ability to utilize retrieval augmented generation (RAG)?", "correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#context-window\",\"https://docs.claude.com/en/docs/resources/glossary#rag-retrieval-augmented-generation\"]" },
        "response": { "output": ["https://docs.claude.com/en/docs/resources/glossary#rag-retrieval-augmented-generation", "https://docs.claude.com/en/docs/resources/glossary#context-window", "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#use-retrieval-for-contextual-consistency"] },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1607,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            { "pass": true, "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        }
      },
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_two"
|
|
},
|
|
"prompt": {
|
|
"raw": "How does the size of the context window impact a language model's ability to utilize retrieval augmented generation (RAG)?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "How does the size of the context window impact a language model's ability to utilize retrieval augmented generation (RAG)?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#context-window\",\"https://docs.claude.com/en/docs/resources/glossary#rag-retrieval-augmented-generation\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/docs/resources/glossary#rag-retrieval-augmented-generation",
|
|
"https://docs.claude.com/en/docs/resources/glossary#context-window",
|
|
"https://docs.claude.com/en/docs/resources/glossary#tokens"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"latencyMs": 1554,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "How can the Evaluation tool in Anthropic's Claude platform help improve prompts and build more robust AI applications?",
"label": "{{ query }}"
},
"vars": {
"query": "How can the Evaluation tool in Anthropic's Claude platform help improve prompts and build more robust AI applications?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\",\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#creating-test-cases\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results",
"https://docs.claude.com/en/docs/intro-to-claude#implementing-claude",
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#why-use-claude-for-sheets"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 1316,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "How can the Evaluation tool in Anthropic's Claude platform help improve prompts and build more robust AI applications?",
"label": "{{ query }}"
},
"vars": {
"query": "How can the Evaluation tool in Anthropic's Claude platform help improve prompts and build more robust AI applications?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\",\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#creating-test-cases\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results",
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial",
"https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 1334,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "How can the Evaluation tool in Anthropic's Claude platform help improve prompts and build more robust AI applications?",
"label": "{{ query }}"
},
"vars": {
"query": "How can the Evaluation tool in Anthropic's Claude platform help improve prompts and build more robust AI applications?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\",\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#creating-test-cases\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results",
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial",
"https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 1,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "What are two ways the Claude Cookbook can help developers learn to use Anthropic's APIs?",
"label": "{{ query }}"
},
"vars": {
"query": "What are two ways the Claude Cookbook can help developers learn to use Anthropic's APIs?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/welcome#develop-with-claude\",\"https://docs.claude.com/en/docs/quickstart#next-steps\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook",
"https://docs.claude.com/en/docs/quickstart#next-steps",
"https://docs.claude.com/en/docs/welcome#develop-with-claude"
]
},
"success": true,
"score": 0.8,
"namedScores": {},
"latencyMs": 4931,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 0.5",
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "Which Claude model has the fastest comparative latency according to the comparison tables?",
"label": "{{ query }}"
},
"vars": {
"query": "Which Claude model has the fastest comparative latency according to the comparison tables?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\",\"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/about-claude/models#model-comparison",
"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification",
"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison"
]
},
"success": true,
"score": 0.8,
"namedScores": {},
"latencyMs": 1229,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "Which Claude model has the fastest comparative latency according to the comparison tables?",
"label": "{{ query }}"
},
"vars": {
"query": "Which Claude model has the fastest comparative latency according to the comparison tables?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\",\"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/about-claude/models#model-comparison",
"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification",
"https://docs.claude.com/en/docs/welcome#models"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 1253,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "Which Claude model has the fastest comparative latency according to the comparison tables?",
"label": "{{ query }}"
},
"vars": {
"query": "Which Claude model has the fastest comparative latency according to the comparison tables?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\",\"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/about-claude/models#model-comparison",
"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification",
"https://docs.claude.com/en/docs/welcome#models"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 0,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "How can you build up a conversation with multiple turns using the Anthropic Messages API in Python?",
"label": "{{ query }}"
},
"vars": {
"query": "How can you build up a conversation with multiple turns using the Anthropic Messages API in Python?",
"correct_chunks": "[\"https://docs.claude.com/en/api/client-sdks#python\",\"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns",
"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth",
"https://docs.claude.com/en/api/client-sdks#python"
]
},
"success": true,
"score": 0.8,
"namedScores": {},
"latencyMs": 1414,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "How can you build up a conversation with multiple turns using the Anthropic Messages API in Python?",
"label": "{{ query }}"
},
"vars": {
"query": "How can you build up a conversation with multiple turns using the Anthropic Messages API in Python?",
"correct_chunks": "[\"https://docs.claude.com/en/api/client-sdks#python\",\"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns",
"https://docs.claude.com/en/api/client-sdks#python",
"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth"
]
},
"success": true,
"score": 0.8,
"namedScores": {},
"latencyMs": 1410,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "How can using XML tags to provide a specific role or context help improve Claude's analysis of a legal contract compared to not using a role prompt?",
"label": "{{ query }}"
},
"vars": {
"query": "How can using XML tags to provide a specific role or context help improve Claude's analysis of a legal contract compared to not using a role prompt?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#examples\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-1-legal-contract-analysis\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#why-use-role-prompting",
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#why-use-xml-tags",
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-1-legal-contract-analysis"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 1307,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
"componentResults": [
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "How can using XML tags to provide a specific role or context help improve Claude's analysis of a legal contract compared to not using a role prompt?",
"label": "{{ query }}"
},
"vars": {
"query": "How can using XML tags to provide a specific role or context help improve Claude's analysis of a legal contract compared to not using a role prompt?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#examples\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-1-legal-contract-analysis\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#why-use-role-prompting",
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#why-use-xml-tags",
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-1-legal-contract-analysis"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 1296,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
"componentResults": [
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "How can using XML tags to provide a specific role or context help improve Claude's analysis of a legal contract compared to not using a role prompt?",
"label": "{{ query }}"
},
"vars": {
"query": "How can using XML tags to provide a specific role or context help improve Claude's analysis of a legal contract compared to not using a role prompt?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#examples\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-1-legal-contract-analysis\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#why-use-role-prompting",
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#why-use-xml-tags",
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-1-legal-contract-analysis"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 1,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
"componentResults": [
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "What are the key differences between how Claude 3 Opus and Claude 3 Sonnet handle missing information when making tool calls?",
"label": "{{ query }}"
},
"vars": {
"query": "What are the key differences between how Claude 3 Opus and Claude 3 Sonnet handle missing information when making tool calls?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#chain-of-thought\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#tool-use-examples\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/tool-use#choosing-a-model",
"https://docs.claude.com/en/docs/about-claude/models#model-comparison",
"https://docs.claude.com/en/docs/intro-to-claude#claude-3-family"
]
},
"success": false,
"score": 0,
"namedScores": {},
"latencyMs": 1248,
"error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"gradingResult": {
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"componentResults": [
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "What are the key differences between how Claude 3 Opus and Claude 3 Sonnet handle missing information when making tool calls?",
"label": "{{ query }}"
},
"vars": {
"query": "What are the key differences between how Claude 3 Opus and Claude 3 Sonnet handle missing information when making tool calls?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#chain-of-thought\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#tool-use-examples\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/tool-use#choosing-a-model",
"https://docs.claude.com/en/docs/intro-to-claude#claude-3-family",
"https://docs.claude.com/en/docs/about-claude/models#model-comparison"
]
},
"success": false,
"score": 0,
"namedScores": {},
"latencyMs": 1328,
"error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"gradingResult": {
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"componentResults": [
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "What are the key differences between how Claude 3 Opus and Claude 3 Sonnet handle missing information when making tool calls?",
"label": "{{ query }}"
},
"vars": {
"query": "What are the key differences between how Claude 3 Opus and Claude 3 Sonnet handle missing information when making tool calls?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#chain-of-thought\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#tool-use-examples\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/tool-use#choosing-a-model",
"https://docs.claude.com/en/docs/intro-to-claude#claude-3-family",
"https://docs.claude.com/en/docs/about-claude/models#model-comparison"
]
},
"success": false,
"score": 0,
"namedScores": {},
"latencyMs": 0,
"error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"gradingResult": {
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"componentResults": [
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "How does the size of the context window impact a language model's ability to utilize retrieval augmented generation (RAG)?",
"label": "{{ query }}"
},
"vars": {
"query": "How does the size of the context window impact a language model's ability to utilize retrieval augmented generation (RAG)?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#context-window\",\"https://docs.claude.com/en/docs/resources/glossary#rag-retrieval-augmented-generation\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/resources/glossary#rag-retrieval-augmented-generation",
"https://docs.claude.com/en/docs/resources/glossary#context-window",
"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#2-optimize-prompt-and-output-length"
]
},
"success": true,
"score": 0.8,
"namedScores": {},
"latencyMs": 9206,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "How can you build up a conversation with multiple turns using the Anthropic Messages API in Python?",
"label": "{{ query }}"
},
"vars": {
"query": "How can you build up a conversation with multiple turns using the Anthropic Messages API in Python?",
"correct_chunks": "[\"https://docs.claude.com/en/api/client-sdks#python\",\"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns",
"https://docs.claude.com/en/api/client-sdks#python",
"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth"
]
},
"success": true,
"score": 0.8,
"namedScores": {},
"latencyMs": 5221,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "What steps should be taken to ensure a reliable deployment of an automated ticket routing system using Claude into a production environment?",
"label": "{{ query }}"
},
"vars": {
"query": "What steps should be taken to ensure a reliable deployment of an automated ticket routing system using Claude into a production environment?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations",
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow",
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#introduction"
]
},
"success": true,
"score": 0.8,
"namedScores": {},
"latencyMs": 1513,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "What steps should be taken to ensure a reliable deployment of an automated ticket routing system using Claude into a production environment?",
"label": "{{ query }}"
},
"vars": {
"query": "What steps should be taken to ensure a reliable deployment of an automated ticket routing system using Claude into a production environment?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations",
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow",
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#introduction"
]
},
"success": true,
"score": 0.8,
"namedScores": {},
"latencyMs": 1449,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
|
|
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "How should you evaluate a model's performance on a ticket routing classifier?",
"label": "{{ query }}"
},
"vars": {
"query": "How should you evaluate a model's performance on a ticket routing classifier?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier",
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology",
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 1315,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "What two methods does Anthropic recommend for learning how to prompt engineer with Claude before diving into the techniques?",
"label": "{{ query }}"
},
"vars": {
"query": "What two methods does Anthropic recommend for learning how to prompt engineer with Claude before diving into the techniques?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer",
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow",
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 1085,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "What two methods does Anthropic recommend for learning how to prompt engineer with Claude before diving into the techniques?",
"label": "{{ query }}"
},
"vars": {
"query": "What two methods does Anthropic recommend for learning how to prompt engineer with Claude before diving into the techniques?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer",
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow",
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 0,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "What two methods does Anthropic recommend for learning how to prompt engineer with Claude before diving into the techniques?",
"label": "{{ query }}"
},
"vars": {
"query": "What two methods does Anthropic recommend for learning how to prompt engineer with Claude before diving into the techniques?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer",
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow",
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 0,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "What steps should be taken to ensure a reliable deployment of an automated ticket routing system using Claude into a production environment?",
"label": "{{ query }}"
},
"vars": {
"query": "What steps should be taken to ensure a reliable deployment of an automated ticket routing system using Claude into a production environment?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations",
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow",
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#defining-the-task"
]
},
"success": true,
"score": 0.8,
"namedScores": {},
"latencyMs": 3533,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "How should you evaluate a model's performance on a ticket routing classifier?",
"label": "{{ query }}"
},
"vars": {
"query": "How should you evaluate a model's performance on a ticket routing classifier?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology",
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier",
"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 3437,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "What are the key differences between a pretrained large language model and Claude in terms of their training and capabilities?",
"label": "{{ query }}"
},
"vars": {
"query": "What are the key differences between a pretrained large language model and Claude in terms of their training and capabilities?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#llm\",\"https://docs.claude.com/en/docs/resources/glossary#pretraining\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/welcome#models",
"https://docs.claude.com/en/docs/resources/glossary#pretraining",
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#advantages-of-using-claude"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 1370,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "What are the key differences between a pretrained large language model and Claude in terms of their training and capabilities?",
"label": "{{ query }}"
},
"vars": {
"query": "What are the key differences between a pretrained large language model and Claude in terms of their training and capabilities?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#llm\",\"https://docs.claude.com/en/docs/resources/glossary#pretraining\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/welcome#models",
"https://docs.claude.com/en/docs/resources/glossary#pretraining",
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#advantages-of-using-claude"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 1219,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "What are some key advantages of using prompt engineering instead of fine-tuning to adapt a pretrained language model for a specific task or domain?",
"label": "{{ query }}"
},
"vars": {
"query": "What are some key advantages of using prompt engineering instead of fine-tuning to adapt a pretrained language model for a specific task or domain?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/resources/glossary#pretraining\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer",
"https://docs.claude.com/en/docs/resources/glossary#pretraining",
"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak"
]
},
"success": true,
"score": 0.6666666666666666,
"namedScores": {},
"latencyMs": 1375,
"gradingResult": {
"pass": true,
"score": 0.6666666666666666,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision: 0.6666666666666666 \n Recall: 0.6666666666666666 \n F1 Score: 0.6666666666666666 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Recall is 0.6666666666666666",
"named_scores": {
"Recall": 0.6666666666666666
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "F1 is 0.6666666666666666",
"named_scores": {
"F1": 0.6666666666666666
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Recall is 0.6666666666666666",
"named_scores": {
"Recall": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "F1 is 0.6666666666666666",
"named_scores": {
"F1": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "What are some key advantages of using prompt engineering instead of fine-tuning to adapt a pretrained language model for a specific task or domain?",
"label": "{{ query }}"
},
"vars": {
"query": "What are some key advantages of using prompt engineering instead of fine-tuning to adapt a pretrained language model for a specific task or domain?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/resources/glossary#pretraining\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer",
"https://docs.claude.com/en/docs/resources/glossary#pretraining",
"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak"
]
},
"success": true,
"score": 0.6666666666666666,
"namedScores": {},
"latencyMs": 1393,
"gradingResult": {
"pass": true,
"score": 0.6666666666666666,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision: 0.6666666666666666 \n Recall: 0.6666666666666666 \n F1 Score: 0.6666666666666666 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Recall is 0.6666666666666666",
"named_scores": {
"Recall": 0.6666666666666666
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "F1 is 0.6666666666666666",
"named_scores": {
"F1": 0.6666666666666666
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Recall is 0.6666666666666666",
"named_scores": {
"Recall": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "F1 is 0.6666666666666666",
"named_scores": {
"F1": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "What are some key advantages of using prompt engineering instead of fine-tuning to adapt a pretrained language model for a specific task or domain?",
"label": "{{ query }}"
},
"vars": {
"query": "What are some key advantages of using prompt engineering instead of fine-tuning to adapt a pretrained language model for a specific task or domain?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/resources/glossary#pretraining\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer",
"https://docs.claude.com/en/docs/resources/glossary#pretraining",
"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak"
]
},
"success": true,
"score": 0.6666666666666666,
"namedScores": {},
"latencyMs": 0,
"gradingResult": {
"pass": true,
"score": 0.6666666666666666,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision: 0.6666666666666666 \n Recall: 0.6666666666666666 \n F1 Score: 0.6666666666666666 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Recall is 0.6666666666666666",
"named_scores": {
"Recall": 0.6666666666666666
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "F1 is 0.6666666666666666",
"named_scores": {
"F1": 0.6666666666666666
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Recall is 0.6666666666666666",
"named_scores": {
"Recall": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "F1 is 0.6666666666666666",
"named_scores": {
"F1": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "How should you evaluate a model's performance on a ticket routing classifier?",
"label": "{{ query }}"
},
"vars": {
"query": "How should you evaluate a model's performance on a ticket routing classifier?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier",
"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics",
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 6594,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "What are the key differences between a pretrained large language model and Claude in terms of their training and capabilities?",
"label": "{{ query }}"
},
"vars": {
"query": "What are the key differences between a pretrained large language model and Claude in terms of their training and capabilities?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#llm\",\"https://docs.claude.com/en/docs/resources/glossary#pretraining\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/resources/glossary#pretraining",
"https://docs.claude.com/en/docs/resources/glossary#llm",
"https://docs.claude.com/en/docs/resources/glossary#rlhf"
]
},
"success": true,
"score": 0.8,
"namedScores": {},
"latencyMs": 3803,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "How can you authenticate with GCP before running requests to access Claude models on Vertex AI?",
"label": "{{ query }}"
},
"vars": {
"query": "How can you authenticate with GCP before running requests to access Claude models on Vertex AI?",
"correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-vertex-ai#making-requests\",\"https://docs.claude.com/en/api/claude-on-vertex-ai#accessing-vertex-ai\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/api/claude-on-vertex-ai#accessing-vertex-ai",
"https://docs.claude.com/en/api/claude-on-vertex-ai#making-requests",
"https://docs.claude.com/en/docs/about-claude/models#get-started-with-claude"
]
},
"success": true,
"score": 0.8,
"namedScores": {},
"latencyMs": 1621,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "How can you authenticate with GCP before running requests to access Claude models on Vertex AI?",
"label": "{{ query }}"
},
"vars": {
"query": "How can you authenticate with GCP before running requests to access Claude models on Vertex AI?",
"correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-vertex-ai#making-requests\",\"https://docs.claude.com/en/api/claude-on-vertex-ai#accessing-vertex-ai\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/api/claude-on-vertex-ai#making-requests",
"https://docs.claude.com/en/docs/about-claude/models#get-started-with-claude",
"https://docs.claude.com/en/api/claude-on-vertex-ai#api-model-names"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 1721,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "What new capabilities and features were introduced by Anthropic on May 10th, 2024 and how do they enable users to create and tailor prompts for specific tasks?",
"label": "{{ query }}"
},
"vars": {
"query": "What new capabilities and features were introduced by Anthropic on May 10th, 2024 and how do they enable users to create and tailor prompts for specific tasks?",
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#may-10th-2024\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources",
"https://docs.claude.com/en/docs/quickstart#next-steps",
"https://docs.claude.com/en/release-notes/api#may-10th-2024"
]
},
"success": true,
"score": 0.5,
"namedScores": {},
"latencyMs": 1471,
"gradingResult": {
"pass": true,
"score": 0.5,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.3333333333333333",
"componentResults": [
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "What new capabilities and features were introduced by Anthropic on May 10th, 2024 and how do they enable users to create and tailor prompts for specific tasks?",
"label": "{{ query }}"
},
"vars": {
"query": "What new capabilities and features were introduced by Anthropic on May 10th, 2024 and how do they enable users to create and tailor prompts for specific tasks?",
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#may-10th-2024\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/release-notes/api#may-10th-2024",
"https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources",
"https://docs.claude.com/en/docs/quickstart#next-steps"
]
},
"success": true,
"score": 0.5,
"namedScores": {},
"latencyMs": 1466,
"gradingResult": {
"pass": true,
"score": 0.5,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "How can you authenticate with GCP before running requests to access Claude models on Vertex AI?",
"label": "{{ query }}"
},
"vars": {
"query": "How can you authenticate with GCP before running requests to access Claude models on Vertex AI?",
"correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-vertex-ai#making-requests\",\"https://docs.claude.com/en/api/claude-on-vertex-ai#accessing-vertex-ai\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/api/claude-on-vertex-ai#making-requests",
"https://docs.claude.com/en/api/claude-on-vertex-ai#install-an-sdk-for-accessing-vertex-ai",
"https://docs.claude.com/en/api/claude-on-vertex-ai#model-availability"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 3309,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "On what date did both the Claude 3.5 Sonnet model and the Artifacts feature in Claude.ai become available?",
"label": "{{ query }}"
},
"vars": {
"query": "On what date did both the Claude 3.5 Sonnet model and the Artifacts feature in Claude.ai become available?",
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#june-20th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#june-20th-2024\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/release-notes/claude-apps#june-20th-2024",
"https://docs.claude.com/en/release-notes/api#june-20th-2024",
"https://docs.claude.com/en/docs/intro-to-claude#claude-3-5-family"
]
},
"success": true,
"score": 0.8,
"namedScores": {},
"latencyMs": 1367,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
|
|
"provider": {
|
|
"id": "python:provider_retrieval.py:retrieve_level_two"
|
|
},
|
|
"prompt": {
|
|
"raw": "On what date did both the Claude 3.5 Sonnet model and the Artifacts feature in Claude.ai become available?",
|
|
"label": "{{ query }}"
|
|
},
|
|
"vars": {
|
|
"query": "On what date did both the Claude 3.5 Sonnet model and the Artifacts feature in Claude.ai become available?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#june-20th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#june-20th-2024\"]"
|
|
},
|
|
"response": {
|
|
"output": [
|
|
"https://docs.claude.com/en/release-notes/claude-apps#june-20th-2024",
|
|
"https://docs.claude.com/en/release-notes/api#june-20th-2024",
|
|
"https://docs.claude.com/en/docs/intro-to-claude#claude-3-5-family"
|
|
]
|
|
},
|
|
"success": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"latencyMs": 1388,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
}
|
|
},
|
|
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_three"
        },
        "prompt": {
          "raw": "What new capabilities and features were introduced by Anthropic on May 10th, 2024 and how do they enable users to create and tailor prompts for specific tasks?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What new capabilities and features were introduced by Anthropic on May 10th, 2024 and how do they enable users to create and tailor prompts for specific tasks?",
          "correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#may-10th-2024\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/release-notes/api#may-10th-2024",
            "https://docs.claude.com/en/docs/welcome#develop-with-claude",
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#before-prompt-engineering"
          ]
        },
        "success": true,
        "score": 0.5,
        "namedScores": {},
        "latencyMs": 3911,
        "gradingResult": {
          "pass": true,
          "score": 0.5,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.5,
              "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "F1 is 0.5",
                  "named_scores": {
                    "F1": 0.5
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "F1 is 0.5",
              "named_scores": {
                "F1": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_base"
        },
        "prompt": {
          "raw": "When putting words in Claude's mouth to shape the response, what header and value can you use in the request to limit Claude's response to a single token?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "When putting words in Claude's mouth to shape the response, what header and value can you use in the request to limit Claude's response to a single token?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#basic-request-and-response\",\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth",
            "https://docs.claude.com/en/docs/build-with-claude/tool-use#controlling-claudes-output",
            "https://docs.claude.com/en/docs/resources/glossary#tokens"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1305,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_two"
        },
        "prompt": {
          "raw": "When putting words in Claude's mouth to shape the response, what header and value can you use in the request to limit Claude's response to a single token?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "When putting words in Claude's mouth to shape the response, what header and value can you use in the request to limit Claude's response to a single token?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#basic-request-and-response\",\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth",
            "https://docs.claude.com/en/api/messages-examples#basic-request-and-response",
            "https://docs.claude.com/en/docs/resources/glossary#tokens"
          ]
        },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 1243,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.6666666666666666,
                  "reason": "Precision is 0.6666666666666666",
                  "named_scores": {
                    "Precision": 0.6666666666666666
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.8,
                  "reason": "F1 is 0.8",
                  "named_scores": {
                    "F1": 0.8
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.6666666666666666,
              "reason": "Precision is 0.6666666666666666",
              "named_scores": {
                "Precision": 0.6666666666666666
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.8,
              "reason": "F1 is 0.8",
              "named_scores": {
                "F1": 0.8
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_base"
        },
        "prompt": {
          "raw": "What does the temperature parameter do when working with large language models?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What does the temperature parameter do when working with large language models?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#temperature\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#2-optimize-prompt-and-output-length\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/resources/glossary#temperature",
            "https://docs.claude.com/en/docs/welcome#models",
            "https://docs.claude.com/en/docs/resources/glossary#tokens"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1355,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_two"
        },
        "prompt": {
          "raw": "What does the temperature parameter do when working with large language models?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What does the temperature parameter do when working with large language models?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#temperature\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#2-optimize-prompt-and-output-length\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/resources/glossary#temperature",
            "https://docs.claude.com/en/docs/build-with-claude/tool-use#controlling-claudes-output",
            "https://docs.claude.com/en/docs/welcome#models"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1363,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_three"
        },
        "prompt": {
          "raw": "What does the temperature parameter do when working with large language models?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What does the temperature parameter do when working with large language models?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#temperature\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#2-optimize-prompt-and-output-length\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/resources/glossary#temperature",
            "https://docs.claude.com/en/docs/build-with-claude/tool-use#controlling-claudes-output",
            "https://docs.claude.com/en/docs/welcome#models"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 0,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_three"
        },
        "prompt": {
          "raw": "On what date did both the Claude 3.5 Sonnet model and the Artifacts feature in Claude.ai become available?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "On what date did both the Claude 3.5 Sonnet model and the Artifacts feature in Claude.ai become available?",
          "correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#june-20th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#june-20th-2024\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/release-notes/claude-apps#june-20th-2024",
            "https://docs.claude.com/en/release-notes/api#june-20th-2024",
            "https://docs.claude.com/en/docs/intro-to-claude#claude-3-5-family"
          ]
        },
        "success": true,
        "score": 0.8,
        "namedScores": {},
        "latencyMs": 3867,
        "gradingResult": {
          "pass": true,
          "score": 0.8,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.8,
              "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.6666666666666666,
                  "reason": "Precision is 0.6666666666666666",
                  "named_scores": {
                    "Precision": 0.6666666666666666
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.8,
                  "reason": "F1 is 0.8",
                  "named_scores": {
                    "F1": 0.8
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.6666666666666666,
              "reason": "Precision is 0.6666666666666666",
              "named_scores": {
                "Precision": 0.6666666666666666
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.8,
              "reason": "F1 is 0.8",
              "named_scores": {
                "F1": 0.8
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_base"
        },
        "prompt": {
          "raw": "What are two ways to specify API parameters when calling the Claude API using Claude for Sheets?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What are two ways to specify API parameters when calling the Claude API using Claude for Sheets?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#tips-for-effective-evaluation\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#how-to-prefill-claudes-response\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#enter-your-first-prompt\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#claude-for-sheets-usage-examples",
            "https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#get-started-with-claude-for-sheets",
            "https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#enter-your-first-prompt"
          ]
        },
        "success": true,
        "score": 0.3333333333333333,
        "namedScores": {},
        "latencyMs": 1675,
        "gradingResult": {
          "pass": true,
          "score": 0.3333333333333333,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.3333333333333333 \n F1 Score: 0.3333333333333333 \n MRR: 0.3333333333333333",
              "componentResults": [
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "MRR is 0.3333333333333333",
                  "named_scores": {
                    "MRR": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Recall is 0.3333333333333333",
                  "named_scores": {
                    "Recall": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "F1 is 0.3333333333333333",
                  "named_scores": {
                    "F1": 0.3333333333333333
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "MRR is 0.3333333333333333",
              "named_scores": {
                "MRR": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Recall is 0.3333333333333333",
              "named_scores": {
                "Recall": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "F1 is 0.3333333333333333",
              "named_scores": {
                "F1": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_two"
        },
        "prompt": {
          "raw": "What are two ways to specify API parameters when calling the Claude API using Claude for Sheets?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What are two ways to specify API parameters when calling the Claude API using Claude for Sheets?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#tips-for-effective-evaluation\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#how-to-prefill-claudes-response\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#enter-your-first-prompt\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#get-started-with-claude-for-sheets",
            "https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#enter-your-first-prompt",
            "https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#optional-function-parameters"
          ]
        },
        "success": true,
        "score": 0.3333333333333333,
        "namedScores": {},
        "latencyMs": 1635,
        "gradingResult": {
          "pass": true,
          "score": 0.3333333333333333,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.3333333333333333 \n F1 Score: 0.3333333333333333 \n MRR: 0.5",
              "componentResults": [
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "MRR is 0.5",
                  "named_scores": {
                    "MRR": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Recall is 0.3333333333333333",
                  "named_scores": {
                    "Recall": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "F1 is 0.3333333333333333",
                  "named_scores": {
                    "F1": 0.3333333333333333
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "MRR is 0.5",
              "named_scores": {
                "MRR": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Recall is 0.3333333333333333",
              "named_scores": {
                "Recall": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "F1 is 0.3333333333333333",
              "named_scores": {
                "F1": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_three"
        },
        "prompt": {
          "raw": "When putting words in Claude's mouth to shape the response, what header and value can you use in the request to limit Claude's response to a single token?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "When putting words in Claude's mouth to shape the response, what header and value can you use in the request to limit Claude's response to a single token?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#basic-request-and-response\",\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth",
            "https://docs.claude.com/en/api/rate-limits#rate-limits",
            "https://docs.claude.com/en/docs/resources/glossary#tokens"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 4162,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_two"
        },
        "prompt": {
          "raw": "How does prefilling the response with an opening curly brace ({ ) affect Claude's output when extracting structured data from text?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How does prefilling the response with an opening curly brace ({ ) affect Claude's output when extracting structured data from text?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#example-1-controlling-output-formatting-and-skipping-the-preamble\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#example-1-controlling-output-formatting-and-skipping-the-preamble",
            "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#prefill-claudes-response",
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#how-to-prefill-claudes-response"
          ]
        },
        "success": true,
        "score": 0.5,
        "namedScores": {},
        "latencyMs": 1293,
        "gradingResult": {
          "pass": true,
          "score": 0.5,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.5,
              "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "F1 is 0.5",
                  "named_scores": {
                    "F1": 0.5
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "F1 is 0.5",
              "named_scores": {
                "F1": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_base"
        },
        "prompt": {
          "raw": "How does prefilling the response with an opening curly brace ({ ) affect Claude's output when extracting structured data from text?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How does prefilling the response with an opening curly brace ({ ) affect Claude's output when extracting structured data from text?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#example-1-controlling-output-formatting-and-skipping-the-preamble\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#example-1-controlling-output-formatting-and-skipping-the-preamble",
            "https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#how-to-prefill-claudes-response",
            "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#prefill-claudes-response"
          ]
        },
        "success": true,
        "score": 0.5,
        "namedScores": {},
        "latencyMs": 1347,
        "gradingResult": {
          "pass": true,
          "score": 0.5,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.5,
              "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 1,
                  "reason": "Recall is 1.0",
                  "named_scores": {
                    "Recall": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "F1 is 0.5",
                  "named_scores": {
                    "F1": 0.5
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "Recall is 1.0",
              "named_scores": {
                "Recall": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "F1 is 0.5",
              "named_scores": {
                "F1": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_three"
        },
        "prompt": {
          "raw": "What are two ways to specify API parameters when calling the Claude API using Claude for Sheets?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What are two ways to specify API parameters when calling the Claude API using Claude for Sheets?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#tips-for-effective-evaluation\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#how-to-prefill-claudes-response\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#enter-your-first-prompt\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#enter-your-first-prompt",
            "https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#optional-function-parameters",
            "https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#advanced-use"
          ]
        },
        "success": true,
        "score": 0.3333333333333333,
        "namedScores": {},
        "latencyMs": 3207,
        "gradingResult": {
          "pass": true,
          "score": 0.3333333333333333,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.3333333333333333 \n F1 Score: 0.3333333333333333 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Recall is 0.3333333333333333",
                  "named_scores": {
                    "Recall": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "F1 is 0.3333333333333333",
                  "named_scores": {
                    "F1": 0.3333333333333333
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Recall is 0.3333333333333333",
              "named_scores": {
                "Recall": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "F1 is 0.3333333333333333",
              "named_scores": {
                "F1": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_two"
        },
        "prompt": {
          "raw": "What are some helpful resources provided by Anthropic to dive deeper into building with images using Claude?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What are some helpful resources provided by Anthropic to dive deeper into building with images using Claude?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#dive-deeper-into-vision\",\"https://docs.claude.com/en/docs/build-with-claude/vision#about-the-prompt-examples\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/vision#prompt-examples",
            "https://docs.claude.com/en/docs/build-with-claude/vision#dive-deeper-into-vision",
            "https://docs.claude.com/en/docs/welcome#develop-with-claude"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1487,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
              "componentResults": [
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "MRR is 0.5",
                  "named_scores": {
                    "MRR": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "MRR is 0.5",
              "named_scores": {
                "MRR": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_base"
        },
        "prompt": {
          "raw": "What are some helpful resources provided by Anthropic to dive deeper into building with images using Claude?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "What are some helpful resources provided by Anthropic to dive deeper into building with images using Claude?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#dive-deeper-into-vision\",\"https://docs.claude.com/en/docs/build-with-claude/vision#about-the-prompt-examples\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/build-with-claude/vision#dive-deeper-into-vision",
            "https://docs.claude.com/en/docs/intro-to-claude#start-building-with-claude",
            "https://docs.claude.com/en/docs/welcome#develop-with-claude"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1530,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 1,
                  "reason": "MRR is 1.0",
                  "named_scores": {
                    "MRR": 1
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 1,
              "reason": "MRR is 1.0",
              "named_scores": {
                "MRR": 1
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_base"
        },
        "prompt": {
          "raw": "How do you specify the API key when creating a new Anthropic client in the Python and TypeScript SDK examples?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How do you specify the API key when creating a new Anthropic client in the Python and TypeScript SDK examples?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/client-sdks#typescript\",\"https://docs.claude.com/en/api/client-sdks#python\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/docs/quickstart#set-your-api-key",
            "https://docs.claude.com/en/docs/quickstart#prerequisites",
            "https://docs.claude.com/en/api/#authentication"
          ]
        },
        "success": false,
        "score": 0,
        "namedScores": {},
        "latencyMs": 1355,
        "error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "gradingResult": {
          "pass": false,
          "score": 0,
          "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": false,
              "score": 0,
              "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
              "componentResults": [
                {
                  "pass": true,
                  "score": 0,
                  "reason": "MRR is 0",
                  "named_scores": {
                    "MRR": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "Precision is 0.0",
                  "named_scores": {
                    "Precision": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "Recall is 0.0",
                  "named_scores": {
                    "Recall": 0
                  }
                },
                {
                  "pass": true,
                  "score": 0,
                  "reason": "F1 is 0",
                  "named_scores": {
                    "F1": 0
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "MRR is 0",
              "named_scores": {
                "MRR": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "Precision is 0.0",
              "named_scores": {
                "Precision": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "Recall is 0.0",
              "named_scores": {
                "Recall": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0,
              "reason": "F1 is 0",
              "named_scores": {
                "F1": 0
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_two"
        },
        "prompt": {
          "raw": "How do you specify the API key when creating a new Anthropic client in the Python and TypeScript SDK examples?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How do you specify the API key when creating a new Anthropic client in the Python and TypeScript SDK examples?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/client-sdks#typescript\",\"https://docs.claude.com/en/api/client-sdks#python\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/#authentication",
            "https://docs.claude.com/en/docs/quickstart#set-your-api-key",
            "https://docs.claude.com/en/api/client-sdks#typescript"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1357,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
              "componentResults": [
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "MRR is 0.3333333333333333",
                  "named_scores": {
                    "MRR": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "MRR is 0.3333333333333333",
              "named_scores": {
                "MRR": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
      {
        "provider": {
          "id": "python:provider_retrieval.py:retrieve_level_three"
        },
        "prompt": {
          "raw": "How do you specify the API key when creating a new Anthropic client in the Python and TypeScript SDK examples?",
          "label": "{{ query }}"
        },
        "vars": {
          "query": "How do you specify the API key when creating a new Anthropic client in the Python and TypeScript SDK examples?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/client-sdks#typescript\",\"https://docs.claude.com/en/api/client-sdks#python\"]"
        },
        "response": {
          "output": [
            "https://docs.claude.com/en/api/#authentication",
            "https://docs.claude.com/en/docs/quickstart#set-your-api-key",
            "https://docs.claude.com/en/api/client-sdks#typescript"
          ]
        },
        "success": true,
        "score": 0.4,
        "namedScores": {},
        "latencyMs": 1,
        "gradingResult": {
          "pass": true,
          "score": 0.4,
          "reason": "All assertions passed",
          "namedScores": {},
          "tokensUsed": {
            "total": 0,
            "prompt": 0,
            "completion": 0
          },
          "componentResults": [
            {
              "pass": true,
              "score": 0.4,
              "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
              "componentResults": [
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "MRR is 0.3333333333333333",
                  "named_scores": {
                    "MRR": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.3333333333333333,
                  "reason": "Precision is 0.3333333333333333",
                  "named_scores": {
                    "Precision": 0.3333333333333333
                  }
                },
                {
                  "pass": true,
                  "score": 0.5,
                  "reason": "Recall is 0.5",
                  "named_scores": {
                    "Recall": 0.5
                  }
                },
                {
                  "pass": true,
                  "score": 0.4,
                  "reason": "F1 is 0.4",
                  "named_scores": {
                    "F1": 0.4
                  }
                }
              ],
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "MRR is 0.3333333333333333",
              "named_scores": {
                "MRR": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.3333333333333333,
              "reason": "Precision is 0.3333333333333333",
              "named_scores": {
                "Precision": 0.3333333333333333
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.5,
              "reason": "Recall is 0.5",
              "named_scores": {
                "Recall": 0.5
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            },
            {
              "pass": true,
              "score": 0.4,
              "reason": "F1 is 0.4",
              "named_scores": {
                "F1": 0.4
              },
              "assertion": {
                "type": "python",
                "value": "file://eval_retrieval.py"
              }
            }
          ],
          "assertion": null
        }
      },
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "How does prefilling the response with an opening curly brace ({ ) affect Claude's output when extracting structured data from text?",
"label": "{{ query }}"
},
"vars": {
"query": "How does prefilling the response with an opening curly brace ({ ) affect Claude's output when extracting structured data from text?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#example-1-controlling-output-formatting-and-skipping-the-preamble\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#example-1-controlling-output-formatting-and-skipping-the-preamble",
"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#prefill-claudes-response",
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#how-to-prefill-claudes-response"
]
},
"success": true,
"score": 0.5,
"namedScores": {},
"latencyMs": 4071,
"gradingResult": {
"pass": true,
"score": 0.5,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "What are two key benefits of using the Anthropic Evaluation tool when developing prompts for an AI classification application?",
"label": "{{ query }}"
},
"vars": {
"query": "What are two key benefits of using the Anthropic Evaluation tool when developing prompts for an AI classification application?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#2-develop-your-test-cases\",\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results",
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#before-prompt-engineering",
"https://docs.claude.com/en/docs/resources/glossary#hhh"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 1538,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "What are two key benefits of using the Anthropic Evaluation tool when developing prompts for an AI classification application?",
"label": "{{ query }}"
},
"vars": {
"query": "What are two key benefits of using the Anthropic Evaluation tool when developing prompts for an AI classification application?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#2-develop-your-test-cases\",\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#before-prompt-engineering",
"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results",
"https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 1545,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "What are some helpful resources provided by Anthropic to dive deeper into building with images using Claude?",
"label": "{{ query }}"
},
"vars": {
"query": "What are some helpful resources provided by Anthropic to dive deeper into building with images using Claude?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#dive-deeper-into-vision\",\"https://docs.claude.com/en/docs/build-with-claude/vision#about-the-prompt-examples\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/build-with-claude/vision#dive-deeper-into-vision",
"https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook",
"https://docs.claude.com/en/api/messages-examples#vision"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 5134,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "What are the key differences between a pretrained language model like Claude's underlying model, and the final version of Claude available through Anthropic's API?",
"label": "{{ query }}"
},
"vars": {
"query": "What are the key differences between a pretrained language model like Claude's underlying model, and the final version of Claude available through Anthropic's API?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#pretraining\",\"https://docs.claude.com/en/docs/resources/glossary#llm\",\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/welcome#models",
"https://docs.claude.com/en/docs/about-claude/models#model-comparison",
"https://docs.claude.com/en/docs/resources/glossary#pretraining"
]
},
"success": true,
"score": 0.3333333333333333,
"namedScores": {},
"latencyMs": 1350,
"gradingResult": {
"pass": true,
"score": 0.3333333333333333,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.3333333333333333 \n F1 Score: 0.3333333333333333 \n MRR: 0.3333333333333333",
"componentResults": [
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Recall is 0.3333333333333333",
"named_scores": {
"Recall": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "F1 is 0.3333333333333333",
"named_scores": {
"F1": 0.3333333333333333
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Recall is 0.3333333333333333",
"named_scores": {
"Recall": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "F1 is 0.3333333333333333",
"named_scores": {
"F1": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "What are the key differences between a pretrained language model like Claude's underlying model, and the final version of Claude available through Anthropic's API?",
"label": "{{ query }}"
},
"vars": {
"query": "What are the key differences between a pretrained language model like Claude's underlying model, and the final version of Claude available through Anthropic's API?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#pretraining\",\"https://docs.claude.com/en/docs/resources/glossary#llm\",\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/welcome#models",
"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison",
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#advantages-of-using-claude"
]
},
"success": false,
"score": 0,
"namedScores": {},
"latencyMs": 1310,
"error": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"gradingResult": {
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"componentResults": [
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "What are two key benefits of using the Anthropic Evaluation tool when developing prompts for an AI classification application?",
"label": "{{ query }}"
},
"vars": {
"query": "What are two key benefits of using the Anthropic Evaluation tool when developing prompts for an AI classification application?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#2-develop-your-test-cases\",\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results",
"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#advantages-of-using-claude",
"https://docs.claude.com/en/docs/about-claude/models#prompt-and-output-performance"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 3868,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "What is the IPv6 address range used by Anthropic?",
"label": "{{ query }}"
},
"vars": {
"query": "What is the IPv6 address range used by Anthropic?",
"correct_chunks": "[\"https://docs.claude.com/en/api/ip-addresses#ipv6\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/api/ip-addresses#ipv6"
]
},
"success": true,
"score": 1,
"namedScores": {},
"latencyMs": 1303,
"gradingResult": {
"pass": true,
"score": 1,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 1,
"reason": "Precision is 1.0",
"named_scores": {
"Precision": 1
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 1,
"reason": "F1 is 1.0",
"named_scores": {
"F1": 1
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Precision is 1.0",
"named_scores": {
"Precision": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "F1 is 1.0",
"named_scores": {
"F1": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "What is the IPv6 address range used by Anthropic?",
"label": "{{ query }}"
},
"vars": {
"query": "What is the IPv6 address range used by Anthropic?",
"correct_chunks": "[\"https://docs.claude.com/en/api/ip-addresses#ipv6\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/api/ip-addresses#ipv6"
]
},
"success": true,
"score": 1,
"namedScores": {},
"latencyMs": 1362,
"gradingResult": {
"pass": true,
"score": 1,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 1,
"reason": "Precision is 1.0",
"named_scores": {
"Precision": 1
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 1,
"reason": "F1 is 1.0",
"named_scores": {
"F1": 1
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Precision is 1.0",
"named_scores": {
"Precision": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "F1 is 1.0",
"named_scores": {
"F1": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "What is the IPv6 address range used by Anthropic?",
"label": "{{ query }}"
},
"vars": {
"query": "What is the IPv6 address range used by Anthropic?",
"correct_chunks": "[\"https://docs.claude.com/en/api/ip-addresses#ipv6\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/api/ip-addresses#ipv6"
]
},
"success": true,
"score": 1,
"namedScores": {},
"latencyMs": 0,
"gradingResult": {
"pass": true,
"score": 1,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 1,
"reason": "Precision is 1.0",
"named_scores": {
"Precision": 1
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 1,
"reason": "F1 is 1.0",
"named_scores": {
"F1": 1
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Precision is 1.0",
"named_scores": {
"Precision": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "F1 is 1.0",
"named_scores": {
"F1": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_base"
},
"prompt": {
"raw": "When using the Python SDK to create a message with Claude, what are two ways you can specify your API key?",
"label": "{{ query }}"
},
"vars": {
"query": "When using the Python SDK to create a message with Claude, what are two ways you can specify your API key?",
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns\",\"https://docs.claude.com/en/api/client-sdks#python\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/api/client-sdks#python",
"https://docs.claude.com/en/docs/quickstart#call-the-api",
"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#advanced-use"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 1533,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_two"
},
"prompt": {
"raw": "When using the Python SDK to create a message with Claude, what are two ways you can specify your API key?",
"label": "{{ query }}"
},
"vars": {
"query": "When using the Python SDK to create a message with Claude, what are two ways you can specify your API key?",
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns\",\"https://docs.claude.com/en/api/client-sdks#python\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/quickstart#set-your-api-key",
"https://docs.claude.com/en/api/client-sdks#python",
"https://docs.claude.com/en/api/client-sdks#typescript"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 1535,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "What are the key differences between a pretrained language model like Claude's underlying model, and the final version of Claude available through Anthropic's API?",
"label": "{{ query }}"
},
"vars": {
"query": "What are the key differences between a pretrained language model like Claude's underlying model, and the final version of Claude available through Anthropic's API?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#pretraining\",\"https://docs.claude.com/en/docs/resources/glossary#llm\",\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/resources/glossary#pretraining",
"https://docs.claude.com/en/docs/resources/glossary#fine-tuning",
"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison"
]
},
"success": true,
"score": 0.6666666666666666,
"namedScores": {},
"latencyMs": 3517,
"gradingResult": {
"pass": true,
"score": 0.6666666666666666,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision: 0.6666666666666666 \n Recall: 0.6666666666666666 \n F1 Score: 0.6666666666666666 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Recall is 0.6666666666666666",
"named_scores": {
"Recall": 0.6666666666666666
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "F1 is 0.6666666666666666",
"named_scores": {
"F1": 0.6666666666666666
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Recall is 0.6666666666666666",
"named_scores": {
"Recall": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "F1 is 0.6666666666666666",
"named_scores": {
"F1": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
},
{
"provider": {
"id": "python:provider_retrieval.py:retrieve_level_three"
},
"prompt": {
"raw": "When using the Python SDK to create a message with Claude, what are two ways you can specify your API key?",
"label": "{{ query }}"
},
"vars": {
"query": "When using the Python SDK to create a message with Claude, what are two ways you can specify your API key?",
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns\",\"https://docs.claude.com/en/api/client-sdks#python\"]"
},
"response": {
"output": [
"https://docs.claude.com/en/docs/quickstart#set-your-api-key",
"https://docs.claude.com/en/api/client-sdks#python",
"https://docs.claude.com/en/docs/quickstart#call-the-api"
]
},
"success": true,
"score": 0.4,
"namedScores": {},
"latencyMs": 4568,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
}
}
],
"stats": {
|
|
"successes": 265,
|
|
"failures": 35,
|
|
"tokenUsage": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0,
|
|
"cached": 0
|
|
}
|
|
},
|
|
"table": {
|
|
"head": {
|
|
"prompts": [
|
|
{
|
|
"raw": "{{ query }}",
|
|
"label": "{{ query }}",
|
|
"id": "66df46182523d473823ff69a7aad542f6c788ca2e644faeeacbe483c9482be41",
|
|
"provider": "python:provider_retrieval.py:retrieve_base",
|
|
"metrics": {
|
|
"score": 50.02380952380948,
|
|
"testPassCount": 83,
|
|
"testFailCount": 17,
|
|
"assertPassCount": 479,
|
|
"assertFailCount": 16,
|
|
"totalLatencyMs": 351264,
|
|
"tokenUsage": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0,
|
|
"cached": 0
|
|
},
|
|
"namedScores": {},
|
|
"cost": 0
|
|
}
|
|
},
|
|
{
|
|
"raw": "{{ query }}",
|
|
"label": "{{ query }}",
|
|
"id": "66df46182523d473823ff69a7aad542f6c788ca2e644faeeacbe483c9482be41",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_two",
|
|
"metrics": {
|
|
"score": 53.823809523809466,
|
|
"testPassCount": 90,
|
|
"testFailCount": 10,
|
|
"assertPassCount": 490,
|
|
"assertFailCount": 10,
|
|
"totalLatencyMs": 228464,
|
|
"tokenUsage": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0,
|
|
"cached": 0
|
|
},
|
|
"namedScores": {},
|
|
"cost": 0
|
|
}
|
|
},
|
|
{
|
|
"raw": "{{ query }}",
|
|
"label": "{{ query }}",
|
|
"id": "66df46182523d473823ff69a7aad542f6c788ca2e644faeeacbe483c9482be41",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_three",
|
|
"metrics": {
|
|
"score": 54.02380952380946,
|
|
"testPassCount": 92,
|
|
"testFailCount": 8,
|
|
"assertPassCount": 492,
|
|
"assertFailCount": 8,
|
|
"totalLatencyMs": 344289,
|
|
"tokenUsage": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0,
|
|
"cached": 0
|
|
},
|
|
"namedScores": {},
|
|
"cost": 0
|
|
}
|
|
}
|
|
],
|
|
"vars": [
|
|
"correct_chunks",
|
|
"query"
|
|
]
|
|
},
|
|
"body": [
|
|
{
|
|
"description": "Row #1",
|
|
"outputs": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\",\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#building-evals-and-test-cases\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#2-develop-your-test-cases\"]",
|
|
"prompt": "How can you create multiple test cases for an evaluation in the Anthropic Evaluation tool?",
|
|
"provider": "python:provider_retrieval.py:retrieve_base",
|
|
"latencyMs": 106010,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "MRR is 0.5",
|
|
"named_scores": {
|
|
"MRR": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "MRR is 0.5",
|
|
"named_scores": {
|
|
"MRR": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#creating-test-cases\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#2-develop-your-test-cases\",\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\"]",
|
|
"prompt": "How can you create multiple test cases for an evaluation in the Anthropic Evaluation tool?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_two",
|
|
"latencyMs": 96901,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#creating-test-cases\",\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#accessing-the-evaluate-feature\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#2-develop-your-test-cases\"]",
|
|
"prompt": "How can you create multiple test cases for an evaluation in the Anthropic Evaluation tool?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_three",
|
|
"latencyMs": 107527,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
}
|
|
],
|
|
"test": {
|
|
"vars": {
|
|
"query": "How can you create multiple test cases for an evaluation in the Anthropic Evaluation tool?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#creating-test-cases\",\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#building-evals-and-test-cases\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #1"
|
|
},
|
|
"vars": [
|
|
"[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#creating-test-cases\",\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#building-evals-and-test-cases\"]",
|
|
"How can you create multiple test cases for an evaluation in the Anthropic Evaluation tool?"
|
|
]
|
|
},
|
|
{
"description": "Row #2",
"outputs": [
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#before-implementing-embeddings\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models\"]",
"prompt": "What embeddings provider does Anthropic recommend for customized domain-specific models, and what capabilities does this provider offer?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 110531,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#before-implementing-embeddings\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models\"]",
"prompt": "What embeddings provider does Anthropic recommend for customized domain-specific models, and what capabilities does this provider offer?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1011,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#before-implementing-embeddings\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models\"]",
"prompt": "What embeddings provider does Anthropic recommend for customized domain-specific models, and what capabilities does this provider offer?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 1,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "What embeddings provider does Anthropic recommend for customized domain-specific models, and what capabilities does this provider offer?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#before-implementing-embeddings\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #2"
|
|
},
|
|
"vars": [
|
|
"[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#before-implementing-embeddings\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic\"]",
|
|
"What embeddings provider does Anthropic recommend for customized domain-specific models, and what capabilities does this provider offer?"
|
|
]
|
|
},
|
|
    {
      "description": "Row #3",
      "outputs": [
        {
          "pass": true,
          "score": 0.4,
          "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#choosing-the-right-model\"]",
          "prompt": "What are some key success metrics to consider when evaluating Claude's performance on a classification task, and how do they relate to choosing the right model to reduce latency?",
          "provider": "python:provider_retrieval.py:retrieve_base",
          "latencyMs": 957,
          "gradingResult": {
            "pass": true,
            "score": 0.4,
            "reason": "All assertions passed",
            "namedScores": {},
            "tokensUsed": {
              "total": 0,
              "prompt": 0,
              "completion": 0
            },
            "componentResults": [
              {
                "pass": true,
                "score": 0.4,
                "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
                "componentResults": [
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "MRR is 1.0",
                    "named_scores": {
                      "MRR": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.3333333333333333,
                    "reason": "Precision is 0.3333333333333333",
                    "named_scores": {
                      "Precision": 0.3333333333333333
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.5,
                    "reason": "Recall is 0.5",
                    "named_scores": {
                      "Recall": 0.5
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.4,
                    "reason": "F1 is 0.4",
                    "named_scores": {
                      "F1": 0.4
                    }
                  }
                ],
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "MRR is 1.0",
                "named_scores": {
                  "MRR": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.3333333333333333,
                "reason": "Precision is 0.3333333333333333",
                "named_scores": {
                  "Precision": 0.3333333333333333
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.5,
                "reason": "Recall is 0.5",
                "named_scores": {
                  "Recall": 0.5
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.4,
                "reason": "F1 is 0.4",
                "named_scores": {
                  "F1": 0.4
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true,
          "score": 0.4,
          "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#choosing-the-right-model\"]",
          "prompt": "What are some key success metrics to consider when evaluating Claude's performance on a classification task, and how do they relate to choosing the right model to reduce latency?",
          "provider": "python:provider_retrieval.py:retrieve_level_two",
          "latencyMs": 0,
          "gradingResult": {
            "pass": true,
            "score": 0.4,
            "reason": "All assertions passed",
            "namedScores": {},
            "tokensUsed": {
              "total": 0,
              "prompt": 0,
              "completion": 0
            },
            "componentResults": [
              {
                "pass": true,
                "score": 0.4,
                "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
                "componentResults": [
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "MRR is 1.0",
                    "named_scores": {
                      "MRR": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.3333333333333333,
                    "reason": "Precision is 0.3333333333333333",
                    "named_scores": {
                      "Precision": 0.3333333333333333
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.5,
                    "reason": "Recall is 0.5",
                    "named_scores": {
                      "Recall": 0.5
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.4,
                    "reason": "F1 is 0.4",
                    "named_scores": {
                      "F1": 0.4
                    }
                  }
                ],
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "MRR is 1.0",
                "named_scores": {
                  "MRR": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.3333333333333333,
                "reason": "Precision is 0.3333333333333333",
                "named_scores": {
                  "Precision": 0.3333333333333333
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.5,
                "reason": "Recall is 0.5",
                "named_scores": {
                  "Recall": 0.5
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.4,
                "reason": "F1 is 0.4",
                "named_scores": {
                  "F1": 0.4
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true,
          "score": 0.4,
          "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#choosing-the-right-model\"]",
          "prompt": "What are some key success metrics to consider when evaluating Claude's performance on a classification task, and how do they relate to choosing the right model to reduce latency?",
          "provider": "python:provider_retrieval.py:retrieve_level_three",
          "latencyMs": 1,
          "gradingResult": {
            "pass": true,
            "score": 0.4,
            "reason": "All assertions passed",
            "namedScores": {},
            "tokensUsed": {
              "total": 0,
              "prompt": 0,
              "completion": 0
            },
            "componentResults": [
              {
                "pass": true,
                "score": 0.4,
                "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
                "componentResults": [
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "MRR is 1.0",
                    "named_scores": {
                      "MRR": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.3333333333333333,
                    "reason": "Precision is 0.3333333333333333",
                    "named_scores": {
                      "Precision": 0.3333333333333333
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.5,
                    "reason": "Recall is 0.5",
                    "named_scores": {
                      "Recall": 0.5
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.4,
                    "reason": "F1 is 0.4",
                    "named_scores": {
                      "F1": 0.4
                    }
                  }
                ],
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "MRR is 1.0",
                "named_scores": {
                  "MRR": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.3333333333333333,
                "reason": "Precision is 0.3333333333333333",
                "named_scores": {
                  "Precision": 0.3333333333333333
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.5,
                "reason": "Recall is 0.5",
                "named_scores": {
                  "Recall": 0.5
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.4,
                "reason": "F1 is 0.4",
                "named_scores": {
                  "F1": 0.4
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              }
            ],
            "assertion": null
          },
          "cost": 0
        }
      ],
      "test": {
        "vars": {
          "query": "What are some key success metrics to consider when evaluating Claude's performance on a classification task, and how do they relate to choosing the right model to reduce latency?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\"]"
        },
        "assert": [
          {
            "type": "python",
            "value": "file://eval_retrieval.py"
          }
        ],
        "options": {},
        "description": "Row #3"
      },
      "vars": [
        "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\"]",
        "What are some key success metrics to consider when evaluating Claude's performance on a classification task, and how do they relate to choosing the right model to reduce latency?"
      ]
    },
    {
      "description": "Row #4",
      "outputs": [
        {
          "pass": true,
          "score": 0.4,
          "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#why-use-claude-for-sheets\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial\"]",
          "prompt": "What are two ways that Claude for Sheets can improve prompt engineering workflows compared to using chained prompts?",
          "provider": "python:provider_retrieval.py:retrieve_base",
          "latencyMs": 975,
          "gradingResult": {
            "pass": true,
            "score": 0.4,
            "reason": "All assertions passed",
            "namedScores": {},
            "tokensUsed": {
              "total": 0,
              "prompt": 0,
              "completion": 0
            },
            "componentResults": [
              {
                "pass": true,
                "score": 0.4,
                "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
                "componentResults": [
                  {
                    "pass": true,
                    "score": 0.5,
                    "reason": "MRR is 0.5",
                    "named_scores": {
                      "MRR": 0.5
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.3333333333333333,
                    "reason": "Precision is 0.3333333333333333",
                    "named_scores": {
                      "Precision": 0.3333333333333333
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.5,
                    "reason": "Recall is 0.5",
                    "named_scores": {
                      "Recall": 0.5
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.4,
                    "reason": "F1 is 0.4",
                    "named_scores": {
                      "F1": 0.4
                    }
                  }
                ],
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.5,
                "reason": "MRR is 0.5",
                "named_scores": {
                  "MRR": 0.5
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.3333333333333333,
                "reason": "Precision is 0.3333333333333333",
                "named_scores": {
                  "Precision": 0.3333333333333333
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.5,
                "reason": "Recall is 0.5",
                "named_scores": {
                  "Recall": 0.5
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.4,
                "reason": "F1 is 0.4",
                "named_scores": {
                  "F1": 0.4
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true,
          "score": 0.4,
          "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#why-use-claude-for-sheets\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial\"]",
          "prompt": "What are two ways that Claude for Sheets can improve prompt engineering workflows compared to using chained prompts?",
          "provider": "python:provider_retrieval.py:retrieve_level_two",
          "latencyMs": 0,
          "gradingResult": {
            "pass": true,
            "score": 0.4,
            "reason": "All assertions passed",
            "namedScores": {},
            "tokensUsed": {
              "total": 0,
              "prompt": 0,
              "completion": 0
            },
            "componentResults": [
              {
                "pass": true,
                "score": 0.4,
                "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
                "componentResults": [
                  {
                    "pass": true,
                    "score": 0.5,
                    "reason": "MRR is 0.5",
                    "named_scores": {
                      "MRR": 0.5
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.3333333333333333,
                    "reason": "Precision is 0.3333333333333333",
                    "named_scores": {
                      "Precision": 0.3333333333333333
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.5,
                    "reason": "Recall is 0.5",
                    "named_scores": {
                      "Recall": 0.5
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.4,
                    "reason": "F1 is 0.4",
                    "named_scores": {
                      "F1": 0.4
                    }
                  }
                ],
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.5,
                "reason": "MRR is 0.5",
                "named_scores": {
                  "MRR": 0.5
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.3333333333333333,
                "reason": "Precision is 0.3333333333333333",
                "named_scores": {
                  "Precision": 0.3333333333333333
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.5,
                "reason": "Recall is 0.5",
                "named_scores": {
                  "Recall": 0.5
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.4,
                "reason": "F1 is 0.4",
                "named_scores": {
                  "F1": 0.4
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true,
          "score": 0.4,
          "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#why-use-claude-for-sheets\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial\"]",
          "prompt": "What are two ways that Claude for Sheets can improve prompt engineering workflows compared to using chained prompts?",
          "provider": "python:provider_retrieval.py:retrieve_level_three",
          "latencyMs": 0,
          "gradingResult": {
            "pass": true,
            "score": 0.4,
            "reason": "All assertions passed",
            "namedScores": {},
            "tokensUsed": {
              "total": 0,
              "prompt": 0,
              "completion": 0
            },
            "componentResults": [
              {
                "pass": true,
                "score": 0.4,
                "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
                "componentResults": [
                  {
                    "pass": true,
                    "score": 0.5,
                    "reason": "MRR is 0.5",
                    "named_scores": {
                      "MRR": 0.5
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.3333333333333333,
                    "reason": "Precision is 0.3333333333333333",
                    "named_scores": {
                      "Precision": 0.3333333333333333
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.5,
                    "reason": "Recall is 0.5",
                    "named_scores": {
                      "Recall": 0.5
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.4,
                    "reason": "F1 is 0.4",
                    "named_scores": {
                      "F1": 0.4
                    }
                  }
                ],
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.5,
                "reason": "MRR is 0.5",
                "named_scores": {
                  "MRR": 0.5
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.3333333333333333,
                "reason": "Precision is 0.3333333333333333",
                "named_scores": {
                  "Precision": 0.3333333333333333
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.5,
                "reason": "Recall is 0.5",
                "named_scores": {
                  "Recall": 0.5
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.4,
                "reason": "F1 is 0.4",
                "named_scores": {
                  "F1": 0.4
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              }
            ],
            "assertion": null
          },
          "cost": 0
        }
      ],
      "test": {
        "vars": {
          "query": "What are two ways that Claude for Sheets can improve prompt engineering workflows compared to using chained prompts?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#why-use-claude-for-sheets\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts\"]"
        },
        "assert": [
          {
            "type": "python",
            "value": "file://eval_retrieval.py"
          }
        ],
        "options": {},
        "description": "Row #4"
      },
      "vars": [
        "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#why-use-claude-for-sheets\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts\"]",
        "What are two ways that Claude for Sheets can improve prompt engineering workflows compared to using chained prompts?"
      ]
    },
    {
      "description": "Row #5",
      "outputs": [
        {
          "pass": true,
          "score": 0.8,
          "namedScores": {},
          "text": "[\"https://docs.claude.com/en/api/prompt-validation#examples\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth\"]",
          "prompt": "What happens if a prompt for the Text Completions API is missing the \"\\n\\nHuman:\" and \"\\n\\nAssistant:\" turns?",
          "provider": "python:provider_retrieval.py:retrieve_base",
          "latencyMs": 1007,
          "gradingResult": {
            "pass": true,
            "score": 0.8,
            "reason": "All assertions passed",
            "namedScores": {},
            "tokensUsed": {
              "total": 0,
              "prompt": 0,
              "completion": 0
            },
            "componentResults": [
              {
                "pass": true,
                "score": 0.8,
                "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
                "componentResults": [
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "MRR is 1.0",
                    "named_scores": {
                      "MRR": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.6666666666666666,
                    "reason": "Precision is 0.6666666666666666",
                    "named_scores": {
                      "Precision": 0.6666666666666666
                    }
                  },
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "Recall is 1.0",
                    "named_scores": {
                      "Recall": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.8,
                    "reason": "F1 is 0.8",
                    "named_scores": {
                      "F1": 0.8
                    }
                  }
                ],
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "MRR is 1.0",
                "named_scores": {
                  "MRR": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.6666666666666666,
                "reason": "Precision is 0.6666666666666666",
                "named_scores": {
                  "Precision": 0.6666666666666666
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "Recall is 1.0",
                "named_scores": {
                  "Recall": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.8,
                "reason": "F1 is 0.8",
                "named_scores": {
                  "F1": 0.8
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true,
          "score": 0.8,
          "namedScores": {},
          "text": "[\"https://docs.claude.com/en/api/prompt-validation#examples\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth\"]",
          "prompt": "What happens if a prompt for the Text Completions API is missing the \"\\n\\nHuman:\" and \"\\n\\nAssistant:\" turns?",
          "provider": "python:provider_retrieval.py:retrieve_level_two",
          "latencyMs": 1,
          "gradingResult": {
            "pass": true,
            "score": 0.8,
            "reason": "All assertions passed",
            "namedScores": {},
            "tokensUsed": {
              "total": 0,
              "prompt": 0,
              "completion": 0
            },
            "componentResults": [
              {
                "pass": true,
                "score": 0.8,
                "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
                "componentResults": [
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "MRR is 1.0",
                    "named_scores": {
                      "MRR": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.6666666666666666,
                    "reason": "Precision is 0.6666666666666666",
                    "named_scores": {
                      "Precision": 0.6666666666666666
                    }
                  },
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "Recall is 1.0",
                    "named_scores": {
                      "Recall": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.8,
                    "reason": "F1 is 0.8",
                    "named_scores": {
                      "F1": 0.8
                    }
                  }
                ],
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "MRR is 1.0",
                "named_scores": {
                  "MRR": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.6666666666666666,
                "reason": "Precision is 0.6666666666666666",
                "named_scores": {
                  "Precision": 0.6666666666666666
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "Recall is 1.0",
                "named_scores": {
                  "Recall": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.8,
                "reason": "F1 is 0.8",
                "named_scores": {
                  "F1": 0.8
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true,
          "score": 0.8,
          "namedScores": {},
          "text": "[\"https://docs.claude.com/en/api/prompt-validation#examples\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth\"]",
          "prompt": "What happens if a prompt for the Text Completions API is missing the \"\\n\\nHuman:\" and \"\\n\\nAssistant:\" turns?",
          "provider": "python:provider_retrieval.py:retrieve_level_three",
          "latencyMs": 0,
          "gradingResult": {
            "pass": true,
            "score": 0.8,
            "reason": "All assertions passed",
            "namedScores": {},
            "tokensUsed": {
              "total": 0,
              "prompt": 0,
              "completion": 0
            },
            "componentResults": [
              {
                "pass": true,
                "score": 0.8,
                "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
                "componentResults": [
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "MRR is 1.0",
                    "named_scores": {
                      "MRR": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.6666666666666666,
                    "reason": "Precision is 0.6666666666666666",
                    "named_scores": {
                      "Precision": 0.6666666666666666
                    }
                  },
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "Recall is 1.0",
                    "named_scores": {
                      "Recall": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.8,
                    "reason": "F1 is 0.8",
                    "named_scores": {
                      "F1": 0.8
                    }
                  }
                ],
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "MRR is 1.0",
                "named_scores": {
                  "MRR": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.6666666666666666,
                "reason": "Precision is 0.6666666666666666",
                "named_scores": {
                  "Precision": 0.6666666666666666
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "Recall is 1.0",
                "named_scores": {
                  "Recall": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.8,
                "reason": "F1 is 0.8",
                "named_scores": {
                  "F1": 0.8
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              }
            ],
            "assertion": null
          },
          "cost": 0
        }
      ],
      "test": {
        "vars": {
          "query": "What happens if a prompt for the Text Completions API is missing the \"\\n\\nHuman:\" and \"\\n\\nAssistant:\" turns?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt\",\"https://docs.claude.com/en/api/prompt-validation#examples\"]"
        },
        "assert": [
          {
            "type": "python",
            "value": "file://eval_retrieval.py"
          }
        ],
        "options": {},
        "description": "Row #5"
      },
      "vars": [
        "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt\",\"https://docs.claude.com/en/api/prompt-validation#examples\"]",
        "What happens if a prompt for the Text Completions API is missing the \"\\n\\nHuman:\" and \"\\n\\nAssistant:\" turns?"
      ]
    },
    {
      "description": "Row #6",
      "outputs": [
        {
          "pass": true,
          "score": 0.8,
          "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#pricing\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors\"]",
          "prompt": "How do the additional tokens required for tool use in Claude API requests impact pricing compared to regular API requests?",
          "provider": "python:provider_retrieval.py:retrieve_base",
          "latencyMs": 997,
          "gradingResult": {
            "pass": true,
            "score": 0.8,
            "reason": "All assertions passed",
            "namedScores": {},
            "tokensUsed": {
              "total": 0,
              "prompt": 0,
              "completion": 0
            },
            "componentResults": [
              {
                "pass": true,
                "score": 0.8,
                "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
                "componentResults": [
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "MRR is 1.0",
                    "named_scores": {
                      "MRR": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.6666666666666666,
                    "reason": "Precision is 0.6666666666666666",
                    "named_scores": {
                      "Precision": 0.6666666666666666
                    }
                  },
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "Recall is 1.0",
                    "named_scores": {
                      "Recall": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.8,
                    "reason": "F1 is 0.8",
                    "named_scores": {
                      "F1": 0.8
                    }
                  }
                ],
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "MRR is 1.0",
                "named_scores": {
                  "MRR": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.6666666666666666,
                "reason": "Precision is 0.6666666666666666",
                "named_scores": {
                  "Precision": 0.6666666666666666
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "Recall is 1.0",
                "named_scores": {
                  "Recall": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.8,
                "reason": "F1 is 0.8",
                "named_scores": {
                  "F1": 0.8
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true,
          "score": 0.8,
          "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#pricing\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors\"]",
          "prompt": "How do the additional tokens required for tool use in Claude API requests impact pricing compared to regular API requests?",
          "provider": "python:provider_retrieval.py:retrieve_level_two",
          "latencyMs": 1,
          "gradingResult": {
            "pass": true,
            "score": 0.8,
            "reason": "All assertions passed",
            "namedScores": {},
            "tokensUsed": {
              "total": 0,
              "prompt": 0,
              "completion": 0
            },
            "componentResults": [
              {
                "pass": true,
                "score": 0.8,
                "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
                "componentResults": [
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "MRR is 1.0",
                    "named_scores": {
                      "MRR": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.6666666666666666,
                    "reason": "Precision is 0.6666666666666666",
                    "named_scores": {
                      "Precision": 0.6666666666666666
                    }
                  },
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "Recall is 1.0",
                    "named_scores": {
                      "Recall": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.8,
                    "reason": "F1 is 0.8",
                    "named_scores": {
                      "F1": 0.8
                    }
                  }
                ],
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "MRR is 1.0",
                "named_scores": {
                  "MRR": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.6666666666666666,
                "reason": "Precision is 0.6666666666666666",
                "named_scores": {
                  "Precision": 0.6666666666666666
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "Recall is 1.0",
                "named_scores": {
                  "Recall": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.8,
                "reason": "F1 is 0.8",
                "named_scores": {
                  "F1": 0.8
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true,
          "score": 0.8,
          "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#pricing\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors\"]",
          "prompt": "How do the additional tokens required for tool use in Claude API requests impact pricing compared to regular API requests?",
          "provider": "python:provider_retrieval.py:retrieve_level_three",
          "latencyMs": 0,
          "gradingResult": {
            "pass": true,
            "score": 0.8,
            "reason": "All assertions passed",
            "namedScores": {},
            "tokensUsed": {
              "total": 0,
              "prompt": 0,
              "completion": 0
            },
            "componentResults": [
              {
                "pass": true,
                "score": 0.8,
                "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
                "componentResults": [
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "MRR is 1.0",
                    "named_scores": {
                      "MRR": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.6666666666666666,
                    "reason": "Precision is 0.6666666666666666",
                    "named_scores": {
                      "Precision": 0.6666666666666666
                    }
                  },
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "Recall is 1.0",
                    "named_scores": {
                      "Recall": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.8,
                    "reason": "F1 is 0.8",
                    "named_scores": {
                      "F1": 0.8
                    }
                  }
                ],
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "MRR is 1.0",
                "named_scores": {
                  "MRR": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.6666666666666666,
                "reason": "Precision is 0.6666666666666666",
                "named_scores": {
                  "Precision": 0.6666666666666666
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "Recall is 1.0",
                "named_scores": {
                  "Recall": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.8,
                "reason": "F1 is 0.8",
                "named_scores": {
                  "F1": 0.8
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              }
            ],
            "assertion": null
          },
          "cost": 0
        }
      ],
      "test": {
        "vars": {
          "query": "How do the additional tokens required for tool use in Claude API requests impact pricing compared to regular API requests?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#pricing\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\"]"
        },
        "assert": [
          {
            "type": "python",
            "value": "file://eval_retrieval.py"
          }
        ],
        "options": {},
        "description": "Row #6"
      },
      "vars": [
        "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#pricing\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\"]",
        "How do the additional tokens required for tool use in Claude API requests impact pricing compared to regular API requests?"
      ]
    },
    {
      "description": "Row #7",
      "outputs": [
        {
          "pass": true,
          "score": 0.5,
          "namedScores": {},
          "text": "[\"https://docs.claude.com/en/release-notes/api#june-27th-2024\",\"https://docs.claude.com/en/release-notes/api#may-30th-2024\",\"https://docs.claude.com/en/api/rate-limits#about-our-limits\"]",
          "prompt": "When will the new Anthropic Developer Console features that show API usage, billing details, and rate limits be available?",
          "provider": "python:provider_retrieval.py:retrieve_base",
          "latencyMs": 1030,
          "gradingResult": {
            "pass": true,
            "score": 0.5,
            "reason": "All assertions passed",
            "namedScores": {},
            "tokensUsed": {
              "total": 0,
              "prompt": 0,
              "completion": 0
            },
            "componentResults": [
              {
                "pass": true,
                "score": 0.5,
                "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
                "componentResults": [
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "MRR is 1.0",
                    "named_scores": {
                      "MRR": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.3333333333333333,
                    "reason": "Precision is 0.3333333333333333",
                    "named_scores": {
                      "Precision": 0.3333333333333333
                    }
                  },
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "Recall is 1.0",
                    "named_scores": {
                      "Recall": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.5,
                    "reason": "F1 is 0.5",
                    "named_scores": {
                      "F1": 0.5
                    }
                  }
                ],
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "MRR is 1.0",
                "named_scores": {
                  "MRR": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.3333333333333333,
                "reason": "Precision is 0.3333333333333333",
                "named_scores": {
                  "Precision": 0.3333333333333333
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "Recall is 1.0",
                "named_scores": {
                  "Recall": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.5,
                "reason": "F1 is 0.5",
                "named_scores": {
                  "F1": 0.5
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true,
          "score": 0.5,
          "namedScores": {},
          "text": "[\"https://docs.claude.com/en/release-notes/api#june-27th-2024\",\"https://docs.claude.com/en/release-notes/api#may-30th-2024\",\"https://docs.claude.com/en/api/rate-limits#about-our-limits\"]",
          "prompt": "When will the new Anthropic Developer Console features that show API usage, billing details, and rate limits be available?",
          "provider": "python:provider_retrieval.py:retrieve_level_two",
          "latencyMs": 0,
          "gradingResult": {
            "pass": true,
            "score": 0.5,
            "reason": "All assertions passed",
            "namedScores": {},
            "tokensUsed": {
              "total": 0,
              "prompt": 0,
              "completion": 0
            },
            "componentResults": [
              {
                "pass": true,
                "score": 0.5,
                "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
                "componentResults": [
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "MRR is 1.0",
                    "named_scores": {
                      "MRR": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.3333333333333333,
                    "reason": "Precision is 0.3333333333333333",
                    "named_scores": {
                      "Precision": 0.3333333333333333
                    }
                  },
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "Recall is 1.0",
                    "named_scores": {
                      "Recall": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.5,
                    "reason": "F1 is 0.5",
                    "named_scores": {
                      "F1": 0.5
                    }
                  }
                ],
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "MRR is 1.0",
                "named_scores": {
                  "MRR": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.3333333333333333,
                "reason": "Precision is 0.3333333333333333",
                "named_scores": {
                  "Precision": 0.3333333333333333
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "Recall is 1.0",
                "named_scores": {
                  "Recall": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.5,
                "reason": "F1 is 0.5",
                "named_scores": {
                  "F1": 0.5
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true,
          "score": 0.5,
          "namedScores": {},
          "text": "[\"https://docs.claude.com/en/release-notes/api#june-27th-2024\",\"https://docs.claude.com/en/release-notes/api#may-30th-2024\",\"https://docs.claude.com/en/api/rate-limits#about-our-limits\"]",
          "prompt": "When will the new Anthropic Developer Console features that show API usage, billing details, and rate limits be available?",
          "provider": "python:provider_retrieval.py:retrieve_level_three",
          "latencyMs": 1,
          "gradingResult": {
            "pass": true,
            "score": 0.5,
            "reason": "All assertions passed",
            "namedScores": {},
            "tokensUsed": {
              "total": 0,
              "prompt": 0,
              "completion": 0
            },
            "componentResults": [
              {
                "pass": true,
                "score": 0.5,
                "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
                "componentResults": [
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "MRR is 1.0",
                    "named_scores": {
                      "MRR": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.3333333333333333,
                    "reason": "Precision is 0.3333333333333333",
                    "named_scores": {
                      "Precision": 0.3333333333333333
                    }
                  },
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "Recall is 1.0",
                    "named_scores": {
                      "Recall": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.5,
                    "reason": "F1 is 0.5",
                    "named_scores": {
                      "F1": 0.5
                    }
                  }
                ],
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "MRR is 1.0",
                "named_scores": {
                  "MRR": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.3333333333333333,
                "reason": "Precision is 0.3333333333333333",
                "named_scores": {
                  "Precision": 0.3333333333333333
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "Recall is 1.0",
                "named_scores": {
                  "Recall": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.5,
                "reason": "F1 is 0.5",
                "named_scores": {
                  "F1": 0.5
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              }
            ],
            "assertion": null
          },
          "cost": 0
        }
      ],
      "test": {
        "vars": {
          "query": "When will the new Anthropic Developer Console features that show API usage, billing details, and rate limits be available?",
          "correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#june-27th-2024\"]"
        },
        "assert": [
          {
            "type": "python",
            "value": "file://eval_retrieval.py"
          }
        ],
        "options": {},
        "description": "Row #7"
      },
      "vars": [
        "[\"https://docs.claude.com/en/release-notes/api#june-27th-2024\"]",
        "When will the new Anthropic Developer Console features that show API usage, billing details, and rate limits be available?"
      ]
    },
    {
      "description": "Row #8",
      "outputs": [
        {
          "pass": true,
          "score": 0.8,
          "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#why-not-let-claude-think\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-measure-latency\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#before-implementing-cot\"]",
          "prompt": "When deciding whether to use chain-of-thought (CoT) for a task, what are two key factors to consider in order to strike the right balance between performance and latency?",
          "provider": "python:provider_retrieval.py:retrieve_base",
          "latencyMs": 992,
          "gradingResult": {
            "pass": true,
            "score": 0.8,
            "reason": "All assertions passed",
            "namedScores": {},
            "tokensUsed": {
              "total": 0,
              "prompt": 0,
              "completion": 0
            },
            "componentResults": [
              {
                "pass": true,
                "score": 0.8,
                "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
                "componentResults": [
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "MRR is 1.0",
                    "named_scores": {
                      "MRR": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.6666666666666666,
                    "reason": "Precision is 0.6666666666666666",
                    "named_scores": {
                      "Precision": 0.6666666666666666
                    }
                  },
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "Recall is 1.0",
                    "named_scores": {
                      "Recall": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.8,
                    "reason": "F1 is 0.8",
                    "named_scores": {
                      "F1": 0.8
                    }
                  }
                ],
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "MRR is 1.0",
                "named_scores": {
                  "MRR": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.6666666666666666,
                "reason": "Precision is 0.6666666666666666",
                "named_scores": {
                  "Precision": 0.6666666666666666
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "Recall is 1.0",
                "named_scores": {
                  "Recall": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.8,
                "reason": "F1 is 0.8",
                "named_scores": {
                  "F1": 0.8
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true,
          "score": 0.8,
          "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#why-not-let-claude-think\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-measure-latency\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#before-implementing-cot\"]",
          "prompt": "When deciding whether to use chain-of-thought (CoT) for a task, what are two key factors to consider in order to strike the right balance between performance and latency?",
          "provider": "python:provider_retrieval.py:retrieve_level_two",
          "latencyMs": 0,
          "gradingResult": {
            "pass": true,
            "score": 0.8,
            "reason": "All assertions passed",
            "namedScores": {},
            "tokensUsed": {
              "total": 0,
              "prompt": 0,
              "completion": 0
            },
            "componentResults": [
              {
                "pass": true,
                "score": 0.8,
                "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
                "componentResults": [
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "MRR is 1.0",
                    "named_scores": {
                      "MRR": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.6666666666666666,
                    "reason": "Precision is 0.6666666666666666",
                    "named_scores": {
                      "Precision": 0.6666666666666666
                    }
                  },
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "Recall is 1.0",
                    "named_scores": {
                      "Recall": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.8,
                    "reason": "F1 is 0.8",
                    "named_scores": {
                      "F1": 0.8
                    }
                  }
                ],
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "MRR is 1.0",
                "named_scores": {
                  "MRR": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.6666666666666666,
                "reason": "Precision is 0.6666666666666666",
                "named_scores": {
                  "Precision": 0.6666666666666666
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "Recall is 1.0",
                "named_scores": {
                  "Recall": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.8,
                "reason": "F1 is 0.8",
                "named_scores": {
                  "F1": 0.8
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true,
          "score": 0.8,
          "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#why-not-let-claude-think\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-measure-latency\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#before-implementing-cot\"]",
          "prompt": "When deciding whether to use chain-of-thought (CoT) for a task, what are two key factors to consider in order to strike the right balance between performance and latency?",
          "provider": "python:provider_retrieval.py:retrieve_level_three",
          "latencyMs": 1,
          "gradingResult": {
            "pass": true,
            "score": 0.8,
            "reason": "All assertions passed",
            "namedScores": {},
            "tokensUsed": {
              "total": 0,
              "prompt": 0,
              "completion": 0
            },
            "componentResults": [
              {
                "pass": true,
                "score": 0.8,
                "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
                "componentResults": [
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "MRR is 1.0",
                    "named_scores": {
                      "MRR": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.6666666666666666,
                    "reason": "Precision is 0.6666666666666666",
                    "named_scores": {
                      "Precision": 0.6666666666666666
                    }
                  },
                  {
                    "pass": true,
                    "score": 1,
                    "reason": "Recall is 1.0",
                    "named_scores": {
                      "Recall": 1
                    }
                  },
                  {
                    "pass": true,
                    "score": 0.8,
                    "reason": "F1 is 0.8",
                    "named_scores": {
                      "F1": 0.8
                    }
                  }
                ],
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "MRR is 1.0",
                "named_scores": {
                  "MRR": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.6666666666666666,
                "reason": "Precision is 0.6666666666666666",
                "named_scores": {
                  "Precision": 0.6666666666666666
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 1,
                "reason": "Recall is 1.0",
                "named_scores": {
                  "Recall": 1
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              },
              {
                "pass": true,
                "score": 0.8,
                "reason": "F1 is 0.8",
                "named_scores": {
                  "F1": 0.8
                },
                "assertion": {
                  "type": "python",
                  "value": "file://eval_retrieval.py"
                }
              }
            ],
            "assertion": null
          },
          "cost": 0
        }
      ],
      "test": {
        "vars": {
          "query": "When deciding whether to use chain-of-thought (CoT) for a task, what are two key factors to consider in order to strike the right balance between performance and latency?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#why-not-let-claude-think\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#before-implementing-cot\"]"
        },
        "assert": [
          {
            "type": "python",
            "value": "file://eval_retrieval.py"
          }
        ],
        "options": {},
        "description": "Row #8"
      },
      "vars": [
        "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#why-not-let-claude-think\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#before-implementing-cot\"]",
        "When deciding whether to use chain-of-thought (CoT) for a task, what are two key factors to consider in order to strike the right balance between performance and latency?"
      ]
    },
{
"description": "Row #9",
"outputs": [
{
  "pass": true, "score": 0.4, "namedScores": {},
  "text": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#controlling-claudes-output\",\"https://docs.claude.com/en/docs/build-with-claude/text-generation#text-capabilities-and-use-cases\",\"https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook\"]",
  "prompt": "How can I use Claude to more easily digest the content of long PDF documents?",
  "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1226,
  "gradingResult": {
    "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      {
        "pass": true, "score": 0.4,
        "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
        "componentResults": [
          { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
      },
      { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  },
  "cost": 0
},
{
  "pass": true, "score": 0.4, "namedScores": {},
  "text": "[\"https://docs.claude.com/en/docs/build-with-claude/text-generation#text-capabilities-and-use-cases\",\"https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook\",\"https://docs.claude.com/en/docs/intro-to-claude#what-you-can-do-with-claude\"]",
  "prompt": "How can I use Claude to more easily digest the content of long PDF documents?",
  "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 928,
  "gradingResult": {
    "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      {
        "pass": true, "score": 0.4,
        "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
        "componentResults": [
          { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
      },
      { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  },
  "cost": 0
},
{
  "pass": true, "score": 0.4, "namedScores": {},
  "text": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#controlling-claudes-output\",\"https://docs.claude.com/en/docs/build-with-claude/text-generation#text-capabilities-and-use-cases\",\"https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook\"]",
  "prompt": "How can I use Claude to more easily digest the content of long PDF documents?",
  "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 1,
  "gradingResult": {
    "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      {
        "pass": true, "score": 0.4,
        "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
        "componentResults": [
          { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
      },
      { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  },
  "cost": 0
}
],
"test": {
  "vars": {
    "query": "How can I use Claude to more easily digest the content of long PDF documents?",
    "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook\",\"https://docs.claude.com/en/docs/build-with-claude/vision#before-you-upload\"]"
  },
  "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
  "options": {},
  "description": "Row #9"
},
"vars": [
  "[\"https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook\",\"https://docs.claude.com/en/docs/build-with-claude/vision#before-you-upload\"]",
  "How can I use Claude to more easily digest the content of long PDF documents?"
]
},
{
"description": "Row #10",
"outputs": [
{
  "pass": true, "score": 0.8, "namedScores": {},
  "text": "[\"https://docs.claude.com/en/api/rate-limits#about-our-limits\",\"https://docs.claude.com/en/api/rate-limits#response-headers\",\"https://docs.claude.com/en/release-notes/api#june-27th-2024\"]",
  "prompt": "According to the documentation, where can you view your organization's current API rate limits in the Claude Console?",
  "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1543,
  "gradingResult": {
    "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      {
        "pass": true, "score": 0.8,
        "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
        "componentResults": [
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
      },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  },
  "cost": 0
},
{
  "pass": true, "score": 0.8, "namedScores": {},
  "text": "[\"https://docs.claude.com/en/api/rate-limits#response-headers\",\"https://docs.claude.com/en/api/rate-limits#about-our-limits\",\"https://docs.claude.com/en/release-notes/api#june-27th-2024\"]",
  "prompt": "According to the documentation, where can you view your organization's current API rate limits in the Claude Console?",
  "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1520,
  "gradingResult": {
    "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      {
        "pass": true, "score": 0.8,
        "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 0.5",
        "componentResults": [
          { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
      },
      { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  },
  "cost": 0
},
{
  "pass": true, "score": 0.8, "namedScores": {},
  "text": "[\"https://docs.claude.com/en/release-notes/api#june-27th-2024\",\"https://docs.claude.com/en/api/rate-limits#about-our-limits\",\"https://docs.claude.com/en/api/rate-limits#rate-limits\"]",
  "prompt": "According to the documentation, where can you view your organization's current API rate limits in the Claude Console?",
  "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 2784,
  "gradingResult": {
    "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      {
        "pass": true, "score": 0.8,
        "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
        "componentResults": [
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
      },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  },
  "cost": 0
}
],
"test": {
  "vars": {
    "query": "According to the documentation, where can you view your organization's current API rate limits in the Claude Console?",
    "correct_chunks": "[\"https://docs.claude.com/en/api/rate-limits#about-our-limits\",\"https://docs.claude.com/en/release-notes/api#june-27th-2024\"]"
  },
  "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
  "options": {},
  "description": "Row #10"
},
"vars": [
  "[\"https://docs.claude.com/en/api/rate-limits#about-our-limits\",\"https://docs.claude.com/en/release-notes/api#june-27th-2024\"]",
  "According to the documentation, where can you view your organization's current API rate limits in the Claude Console?"
]
},
{
"description": "Row #11",
"outputs": [
{
  "pass": false, "score": 0, "namedScores": {},
  "text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#advantages-of-using-claude\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics\"]",
  "prompt": "How can we measure the performance of the ticket classification system implemented using Claude beyond just accuracy?",
  "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1227,
  "gradingResult": {
    "pass": false, "score": 0,
    "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      {
        "pass": false, "score": 0,
        "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "componentResults": [
          { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
          { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
          { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
          { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
      },
      { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  },
  "cost": 0
},
{
  "pass": false, "score": 0, "namedScores": {},
  "text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#advantages-of-using-claude\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#deploy-your-classifier\"]",
  "prompt": "How can we measure the performance of the ticket classification system implemented using Claude beyond just accuracy?",
  "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1105,
  "gradingResult": {
    "pass": false, "score": 0,
    "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      {
        "pass": false, "score": 0,
        "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "componentResults": [
          { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
          { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
          { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
          { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
      },
      { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  },
  "cost": 0
},
{
  "pass": false, "score": 0, "namedScores": {},
  "text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#advantages-of-using-claude\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics\"]",
  "prompt": "How can we measure the performance of the ticket classification system implemented using Claude beyond just accuracy?",
  "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 0,
  "gradingResult": {
    "pass": false, "score": 0,
    "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      {
        "pass": false, "score": 0,
        "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "componentResults": [
          { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
          { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
          { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
          { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
      },
      { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  },
  "cost": 0
}
],
"test": {
  "vars": {
    "query": "How can we measure the performance of the ticket classification system implemented using Claude beyond just accuracy?",
    "correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#prompting-claude-for-ticket-routing\"]"
  },
  "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
  "options": {},
  "description": "Row #11"
},
"vars": [
  "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#prompting-claude-for-ticket-routing\"]",
  "How can we measure the performance of the ticket classification system implemented using Claude beyond just accuracy?"
]
},
{
"description": "Row #12",
"outputs": [
{
  "pass": true, "score": 0.4, "namedScores": {},
  "text": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs\"]",
  "prompt": "How can you specify a system prompt using the Text Completions API versus the Messages API?",
  "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1816,
  "gradingResult": {
    "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      {
        "pass": true, "score": 0.4,
        "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
        "componentResults": [
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
      },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  },
  "cost": 0
},
{
  "pass": true, "score": 0.4, "namedScores": {},
  "text": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs\"]",
  "prompt": "How can you specify a system prompt using the Text Completions API versus the Messages API?",
  "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1828,
  "gradingResult": {
    "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      {
        "pass": true, "score": 0.4,
        "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
        "componentResults": [
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
      },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  },
  "cost": 0
},
{
  "pass": true, "score": 0.4, "namedScores": {},
  "text": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format\"]",
  "prompt": "How can you specify a system prompt using the Text Completions API versus the Messages API?",
  "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 3415,
  "gradingResult": {
    "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      {
        "pass": true, "score": 0.4,
        "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
        "componentResults": [
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
      },
      { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  },
  "cost": 0
}
],
"test": {
  "vars": {
    "query": "How can you specify a system prompt using the Text Completions API versus the Messages API?",
    "correct_chunks": "[\"https://docs.claude.com/en/api/prompt-validation#examples\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt\"]"
  },
  "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
  "options": {},
  "description": "Row #12"
},
"vars": [
  "[\"https://docs.claude.com/en/api/prompt-validation#examples\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt\"]",
  "How can you specify a system prompt using the Text Completions API versus the Messages API?"
]
},
{
"description": "Row #13",
"outputs": [
{
  "pass": false, "score": 0, "namedScores": {},
  "text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#why-use-xml-tags\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\"]",
  "prompt": "How can you combine XML tags with chain of thought reasoning to create high-performance prompts for Claude?",
  "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1235,
  "gradingResult": {
    "pass": false, "score": 0,
    "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      {
        "pass": false, "score": 0,
        "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "componentResults": [
          { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
          { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
          { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
          { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
      },
      { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  },
  "cost": 0
},
{
  "pass": false, "score": 0, "namedScores": {},
  "text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#why-use-xml-tags\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\"]",
  "prompt": "How can you combine XML tags with chain of thought reasoning to create high-performance prompts for Claude?",
  "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1,
  "gradingResult": {
    "pass": false, "score": 0,
    "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      {
        "pass": false, "score": 0,
        "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "componentResults": [
          { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
          { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
          { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
          { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
      },
      { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  },
  "cost": 0
},
{
  "pass": false, "score": 0, "namedScores": {},
  "text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#why-use-xml-tags\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\"]",
  "prompt": "How can you combine XML tags with chain of thought reasoning to create high-performance prompts for Claude?",
  "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 1,
  "gradingResult": {
    "pass": false, "score": 0,
    "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      {
        "pass": false, "score": 0,
        "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "componentResults": [
          { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
          { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
          { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
          { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
      },
      { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  },
  "cost": 0
}
],
"test": {
  "vars": {
    "query": "How can you combine XML tags with chain of thought reasoning to create high-performance prompts for Claude?",
    "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#tagging-best-practices\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#chain-of-thought\"]"
  },
  "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
  "options": {},
  "description": "Row #13"
},
"vars": [
  "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#tagging-best-practices\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#chain-of-thought\"]",
  "How can you combine XML tags with chain of thought reasoning to create high-performance prompts for Claude?"
]
},
{
"description": "Row #14",
"outputs": [
{
  "pass": false, "score": 0, "namedScores": {},
  "text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#choosing-the-right-model\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification\"]",
  "prompt": "When evaluating the Claude model's performance for ticket routing, what three key metrics are calculated and what are the results for the claude-3-haiku-20240307 model on the 91 test samples?",
  "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1304,
  "gradingResult": {
    "pass": false, "score": 0,
    "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      {
        "pass": false, "score": 0,
        "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "componentResults": [
          { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
          { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
          { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
          { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
      },
      { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  },
  "cost": 0
},
{
  "pass": false, "score": 0, "namedScores": {},
  "text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#choosing-the-right-model\"]",
  "prompt": "When evaluating the Claude model's performance for ticket routing, what three key metrics are calculated and what are the results for the claude-3-haiku-20240307 model on the 91 test samples?",
  "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1287,
  "gradingResult": {
    "pass": false, "score": 0,
    "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
    "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      {
        "pass": false, "score": 0,
        "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
        "componentResults": [
          { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
          { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
          { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
          { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
      },
      { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  },
  "cost": 0
},
{
  "pass": true, "score": 0.4, "namedScores": {},
  "text": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#choosing-the-right-model\"]",
  "prompt": "When evaluating the Claude model's performance for ticket routing, what three key metrics are calculated and what are the results for the claude-3-haiku-20240307 model on the 91 test samples?",
  "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 4007,
  "gradingResult": {
    "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
    "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
    "componentResults": [
      {
        "pass": true, "score": 0.4,
        "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
        "componentResults": [
          { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
        ],
        "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
      },
      { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
      { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
    ],
    "assertion": null
  },
  "cost": 0
}
],
"test": {
  "vars": {
    "query": "When evaluating the Claude model's performance for ticket routing, what three key metrics are calculated and what are the results for the claude-3-haiku-20240307 model on the 91 test samples?",
    "correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#example-data\"]"
  },
  "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
  "options": {},
  "description": "Row #14"
},
"vars": [
  "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#example-data\"]",
  "When evaluating the Claude model's performance for ticket routing, what three key metrics are calculated and what are the results for the claude-3-haiku-20240307 model on the 91 test samples?"
]
},
{
|
|
"description": "Row #15",
|
|
"outputs": [
|
|
{
|
|
"pass": false,
|
|
"score": 0,
|
|
"namedScores": {},
|
|
"text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/intro-to-claude#implementing-claude\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\",\"https://docs.claude.com/en/docs/intro-to-claude#start-building-with-claude\"]",
|
|
"prompt": "Before starting to engineer and improve a prompt in Claude, what key things does Anthropic recommend you have in place first?",
|
|
"provider": "python:provider_retrieval.py:retrieve_base",
|
|
"latencyMs": 1196,
|
|
"gradingResult": {
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow\",\"https://docs.claude.com/en/docs/build-with-claude/vision#prompt-examples\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#before-prompt-engineering\"]",
      "prompt": "Before starting to engineer and improve a prompt in Claude, what key things does Anthropic recommend you have in place first?",
      "provider": "python:provider_retrieval.py:retrieve_level_two",
      "latencyMs": 1234,
      "gradingResult": {
        "pass": true,
        "score": 0.4,
        "reason": "All assertions passed",
        "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          {
            "pass": true,
            "score": 0.4,
            "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
            "componentResults": [
              { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
              { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
          },
          { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true,
      "score": 0.4,
      "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#before-prompt-engineering\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#iterating-your-prompt-for-better-performance\"]",
      "prompt": "Before starting to engineer and improve a prompt in Claude, what key things does Anthropic recommend you have in place first?",
      "provider": "python:provider_retrieval.py:retrieve_level_three",
      "latencyMs": 3360,
      "gradingResult": {
        "pass": true,
        "score": 0.4,
        "reason": "All assertions passed",
        "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          {
            "pass": true,
            "score": 0.4,
            "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
            "componentResults": [
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
              { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
          },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    }
  ],
  "test": {
    "vars": {
      "query": "Before starting to engineer and improve a prompt in Claude, what key things does Anthropic recommend you have in place first?",
      "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/define-success#next-steps\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#before-prompt-engineering\"]"
    },
    "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
    "options": {},
    "description": "Row #15"
  },
  "vars": [
    "[\"https://docs.claude.com/en/docs/build-with-claude/define-success#next-steps\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#before-prompt-engineering\"]",
    "Before starting to engineer and improve a prompt in Claude, what key things does Anthropic recommend you have in place first?"
  ]
},
{
  "description": "Row #16",
  "outputs": [
    {
      "pass": true,
      "score": 0.8,
      "namedScores": {},
      "text": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format\"]",
      "prompt": "How does the Messages API handle mid-response prompting compared to the Text Completions API?",
      "provider": "python:provider_retrieval.py:retrieve_base",
      "latencyMs": 1196,
      "gradingResult": {
        "pass": true,
        "score": 0.8,
        "reason": "All assertions passed",
        "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          {
            "pass": true,
            "score": 0.8,
            "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
            "componentResults": [
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
              { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
          },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true,
      "score": 0.8,
      "namedScores": {},
      "text": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth\"]",
      "prompt": "How does the Messages API handle mid-response prompting compared to the Text Completions API?",
      "provider": "python:provider_retrieval.py:retrieve_level_two",
      "latencyMs": 1181,
      "gradingResult": {
        "pass": true,
        "score": 0.8,
        "reason": "All assertions passed",
        "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          {
            "pass": true,
            "score": 0.8,
            "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 0.5",
            "componentResults": [
              { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
              { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
          },
          { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true,
      "score": 0.8,
      "namedScores": {},
      "text": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth\"]",
      "prompt": "How does the Messages API handle mid-response prompting compared to the Text Completions API?",
      "provider": "python:provider_retrieval.py:retrieve_level_three",
      "latencyMs": 1,
      "gradingResult": {
        "pass": true,
        "score": 0.8,
        "reason": "All assertions passed",
        "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          {
            "pass": true,
            "score": 0.8,
            "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 0.5",
            "componentResults": [
              { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
              { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
          },
          { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    }
  ],
  "test": {
    "vars": {
      "query": "How does the Messages API handle mid-response prompting compared to the Text Completions API?",
      "correct_chunks": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth\"]"
    },
    "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
    "options": {},
    "description": "Row #16"
  },
  "vars": [
    "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth\"]",
    "How does the Messages API handle mid-response prompting compared to the Text Completions API?"
  ]
},
{
  "description": "Row #17",
  "outputs": [
    {
      "pass": true,
      "score": 0.5,
      "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#why-use-role-prompting\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-2-financial-analysis\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\"]",
      "prompt": "How does Claude's response differ when given a role through a system prompt compared to not having a specific role in the financial analysis example?",
      "provider": "python:provider_retrieval.py:retrieve_base",
      "latencyMs": 1442,
      "gradingResult": {
        "pass": true,
        "score": 0.5,
        "reason": "All assertions passed",
        "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          {
            "pass": true,
            "score": 0.5,
            "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.5",
            "componentResults": [
              { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
          },
          { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true,
      "score": 0.5,
      "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#why-use-role-prompting\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-2-financial-analysis\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\"]",
      "prompt": "How does Claude's response differ when given a role through a system prompt compared to not having a specific role in the financial analysis example?",
      "provider": "python:provider_retrieval.py:retrieve_level_two",
      "latencyMs": 1452,
      "gradingResult": {
        "pass": true,
        "score": 0.5,
        "reason": "All assertions passed",
        "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          {
            "pass": true,
            "score": 0.5,
            "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.5",
            "componentResults": [
              { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
          },
          { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true,
      "score": 0.5,
      "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-2-financial-analysis\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#how-to-give-claude-a-role\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#why-use-role-prompting\"]",
      "prompt": "How does Claude's response differ when given a role through a system prompt compared to not having a specific role in the financial analysis example?",
      "provider": "python:provider_retrieval.py:retrieve_level_three",
      "latencyMs": 3789,
      "gradingResult": {
        "pass": true,
        "score": 0.5,
        "reason": "All assertions passed",
        "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          {
            "pass": true,
            "score": 0.5,
            "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
            "componentResults": [
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
          },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    }
  ],
  "test": {
    "vars": {
      "query": "How does Claude's response differ when given a role through a system prompt compared to not having a specific role in the financial analysis example?",
      "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-2-financial-analysis\"]"
    },
    "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
    "options": {},
    "description": "Row #17"
  },
  "vars": [
    "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-2-financial-analysis\"]",
    "How does Claude's response differ when given a role through a system prompt compared to not having a specific role in the financial analysis example?"
  ]
},
{
  "description": "Row #18",
  "outputs": [
    {
      "pass": true,
      "score": 0.5,
      "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/define-success#common-success-criteria-to-consider\",\"https://docs.claude.com/en/docs/build-with-claude/define-success#building-strong-criteria\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics\"]",
      "prompt": "What are some quantitative metrics that can be used to measure the success of a sentiment analysis model, and how might specific targets for those metrics be determined?",
      "provider": "python:provider_retrieval.py:retrieve_base",
      "latencyMs": 1256,
      "gradingResult": {
        "pass": true,
        "score": 0.5,
        "reason": "All assertions passed",
        "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          {
            "pass": true,
            "score": 0.5,
            "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.5",
            "componentResults": [
              { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
          },
          { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true,
      "score": 0.5,
      "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/define-success#common-success-criteria-to-consider\",\"https://docs.claude.com/en/docs/build-with-claude/define-success#building-strong-criteria\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier\"]",
      "prompt": "What are some quantitative metrics that can be used to measure the success of a sentiment analysis model, and how might specific targets for those metrics be determined?",
      "provider": "python:provider_retrieval.py:retrieve_level_two",
      "latencyMs": 5842,
      "gradingResult": {
        "pass": true,
        "score": 0.5,
        "reason": "All assertions passed",
        "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          {
            "pass": true,
            "score": 0.5,
            "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.5",
            "componentResults": [
              { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
          },
          { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true,
      "score": 0.5,
      "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/define-success#building-strong-criteria\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics\",\"https://docs.claude.com/en/docs/build-with-claude/define-success#common-success-criteria-to-consider\"]",
      "prompt": "What are some quantitative metrics that can be used to measure the success of a sentiment analysis model, and how might specific targets for those metrics be determined?",
      "provider": "python:provider_retrieval.py:retrieve_level_three",
      "latencyMs": 8264,
      "gradingResult": {
        "pass": true,
        "score": 0.5,
        "reason": "All assertions passed",
        "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          {
            "pass": true,
            "score": 0.5,
            "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
            "componentResults": [
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
          },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    }
  ],
  "test": {
    "vars": {
      "query": "What are some quantitative metrics that can be used to measure the success of a sentiment analysis model, and how might specific targets for those metrics be determined?",
      "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/define-success#building-strong-criteria\"]"
    },
    "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
    "options": {},
    "description": "Row #18"
  },
  "vars": [
    "[\"https://docs.claude.com/en/docs/build-with-claude/define-success#building-strong-criteria\"]",
    "What are some quantitative metrics that can be used to measure the success of a sentiment analysis model, and how might specific targets for those metrics be determined?"
  ]
},
{
  "description": "Row #19",
  "outputs": [
    {
      "pass": true,
      "score": 0.8,
      "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#tagging-best-practices\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#why-use-xml-tags\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\"]",
      "prompt": "What is a power user tip mentioned in the documentation for creating high-performance prompts using XML tags?",
      "provider": "python:provider_retrieval.py:retrieve_base",
      "latencyMs": 1011,
      "gradingResult": {
        "pass": true,
        "score": 0.8,
        "reason": "All assertions passed",
        "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          {
            "pass": true,
            "score": 0.8,
            "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
            "componentResults": [
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
              { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
          },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true,
      "score": 0.8,
      "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#tagging-best-practices\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#why-use-xml-tags\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\"]",
      "prompt": "What is a power user tip mentioned in the documentation for creating high-performance prompts using XML tags?",
      "provider": "python:provider_retrieval.py:retrieve_level_two",
      "latencyMs": 0,
      "gradingResult": {
        "pass": true,
        "score": 0.8,
        "reason": "All assertions passed",
        "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          {
            "pass": true,
            "score": 0.8,
            "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
            "componentResults": [
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
              { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
          },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true,
      "score": 0.8,
      "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#tagging-best-practices\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#why-use-xml-tags\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\"]",
      "prompt": "What is a power user tip mentioned in the documentation for creating high-performance prompts using XML tags?",
      "provider": "python:provider_retrieval.py:retrieve_level_three",
      "latencyMs": 1,
      "gradingResult": {
        "pass": true,
        "score": 0.8,
        "reason": "All assertions passed",
        "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          {
            "pass": true,
            "score": 0.8,
            "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
            "componentResults": [
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
              { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
          },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    }
  ],
  "test": {
    "vars": {
      "query": "What is a power user tip mentioned in the documentation for creating high-performance prompts using XML tags?",
      "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#tagging-best-practices\"]"
    },
    "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
    "options": {},
    "description": "Row #19"
  },
  "vars": [
    "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#tagging-best-practices\"]",
    "What is a power user tip mentioned in the documentation for creating high-performance prompts using XML tags?"
  ]
},
{
  "description": "Row #20",
  "outputs": [
    {
      "pass": true,
      "score": 0.4,
      "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#when-to-use-claude-for-classification\",\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#grading-evals\",\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#tips-for-llm-based-grading\"]",
      "prompt": "How can you use an LLM like Claude to automatically grade the outputs of other LLMs based on a rubric?",
      "provider": "python:provider_retrieval.py:retrieve_base",
      "latencyMs": 1277,
      "gradingResult": {
        "pass": true,
        "score": 0.4,
        "reason": "All assertions passed",
        "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          {
            "pass": true,
            "score": 0.4,
            "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
            "componentResults": [
              { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
              { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
          },
          { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true,
      "score": 0.4,
      "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#tips-for-llm-based-grading\",\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#grading-evals\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#when-to-use-claude-for-classification\"]",
      "prompt": "How can you use an LLM like Claude to automatically grade the outputs of other LLMs based on a rubric?",
      "provider": "python:provider_retrieval.py:retrieve_level_two",
      "latencyMs": 1237,
      "gradingResult": {
        "pass": true,
        "score": 0.4,
        "reason": "All assertions passed",
        "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          {
            "pass": true,
            "score": 0.4,
            "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
            "componentResults": [
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
              { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
          },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true,
      "score": 0.4,
      "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#when-to-use-claude-for-classification\",\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#grading-evals\",\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#tips-for-llm-based-grading\"]",
      "prompt": "How can you use an LLM like Claude to automatically grade the outputs of other LLMs based on a rubric?",
      "provider": "python:provider_retrieval.py:retrieve_level_three",
      "latencyMs": 1,
      "gradingResult": {
        "pass": true,
        "score": 0.4,
        "reason": "All assertions passed",
        "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          {
            "pass": true,
            "score": 0.4,
            "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
            "componentResults": [
              { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
              { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
          },
          { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    }
  ],
  "test": {
    "vars": {
      "query": "How can you use an LLM like Claude to automatically grade the outputs of other LLMs based on a rubric?",
      "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#tips-for-llm-based-grading\",\"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns\"]"
    },
    "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
    "options": {},
    "description": "Row #20"
  },
  "vars": [
    "[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#tips-for-llm-based-grading\",\"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns\"]",
    "How can you use an LLM like Claude to automatically grade the outputs of other LLMs based on a rubric?"
  ]
},
{
  "description": "Row #21",
  "outputs": [
    {
      "pass": true,
      "score": 0.5,
      "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-on-the-aws-marketplace\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#getting-started-with-voyage-ai\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#pricing\"]",
      "prompt": "How can you access and deploy Voyage embeddings on AWS Marketplace?",
      "provider": "python:provider_retrieval.py:retrieve_base",
      "latencyMs": 1145,
      "gradingResult": {
        "pass": true,
        "score": 0.5,
        "reason": "All assertions passed",
        "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          {
            "pass": true,
            "score": 0.5,
            "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
            "componentResults": [
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
          },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true,
      "score": 0.5,
      "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-on-the-aws-marketplace\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#getting-started-with-voyage-ai\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#pricing\"]",
      "prompt": "How can you access and deploy Voyage embeddings on AWS Marketplace?",
      "provider": "python:provider_retrieval.py:retrieve_level_two",
      "latencyMs": 1087,
      "gradingResult": {
        "pass": true,
        "score": 0.5,
        "reason": "All assertions passed",
        "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          {
            "pass": true,
            "score": 0.5,
            "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
            "componentResults": [
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
          },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true,
      "score": 0.5,
      "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-on-the-aws-marketplace\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#getting-started-with-voyage-ai\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#pricing\"]",
      "prompt": "How can you access and deploy Voyage embeddings on AWS Marketplace?",
      "provider": "python:provider_retrieval.py:retrieve_level_three",
      "latencyMs": 0,
      "gradingResult": {
        "pass": true,
        "score": 0.5,
        "reason": "All assertions passed",
        "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          {
            "pass": true,
            "score": 0.5,
            "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
            "componentResults": [
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
          },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    }
  ],
  "test": {
    "vars": {
      "query": "How can you access and deploy Voyage embeddings on AWS Marketplace?",
      "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-on-the-aws-marketplace\"]"
    },
    "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
    "options": {},
    "description": "Row #21"
  },
  "vars": [
    "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-on-the-aws-marketplace\"]",
    "How can you access and deploy Voyage embeddings on AWS Marketplace?"
  ]
},
|
|
    {
      "description": "Row #22",
      "outputs": [
        {
          "pass": true, "score": 0.4, "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#controlling-claudes-output\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#next-steps\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#json-output\"]",
          "prompt": "When using tools just to get Claude to produce JSON output following a particular schema, what key things should you do in terms of tool setup and prompting?",
          "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1468,
          "gradingResult": {
            "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
            "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
            "componentResults": [
              { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333", "componentResults": [
                { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true, "score": 0.4, "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#json-output\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#next-steps\"]",
          "prompt": "When using tools just to get Claude to produce JSON output following a particular schema, what key things should you do in terms of tool setup and prompting?",
          "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1596,
          "gradingResult": {
            "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
            "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
            "componentResults": [
              { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5", "componentResults": [
                { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true, "score": 0.4, "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#json-output\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#chain-of-thought\"]",
          "prompt": "When using tools just to get Claude to produce JSON output following a particular schema, what key things should you do in terms of tool setup and prompting?",
          "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 3757,
          "gradingResult": {
            "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
            "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
            "componentResults": [
              { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0", "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
            ],
            "assertion": null
          },
          "cost": 0
        }
      ],
      "test": {
        "vars": {
          "query": "When using tools just to get Claude to produce JSON output following a particular schema, what key things should you do in terms of tool setup and prompting?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#tool-use-examples\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#json-output\"]"
        },
        "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
        "options": {},
        "description": "Row #22"
      },
      "vars": [
        "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#tool-use-examples\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#json-output\"]",
        "When using tools just to get Claude to produce JSON output following a particular schema, what key things should you do in terms of tool setup and prompting?"
      ]
    },
    {
      "description": "Row #23",
      "outputs": [
        {
          "pass": true, "score": 0.6666666666666666, "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/about-claude/models#legacy-models\",\"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification\"]",
          "prompt": "What are the key differences between the legacy Claude Instant 1.2 model and the Claude 3 Haiku model in terms of capabilities and performance?",
          "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1431,
          "gradingResult": {
            "pass": true, "score": 0.6666666666666666, "reason": "All assertions passed", "namedScores": {},
            "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
            "componentResults": [
              { "pass": true, "score": 0.6666666666666666, "reason": "Precision: 0.6666666666666666 \n Recall: 0.6666666666666666 \n F1 Score: 0.6666666666666666 \n MRR: 1.0", "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Recall is 0.6666666666666666", "named_scores": { "Recall": 0.6666666666666666 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "F1 is 0.6666666666666666", "named_scores": { "F1": 0.6666666666666666 } }
              ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.6666666666666666, "reason": "Recall is 0.6666666666666666", "named_scores": { "Recall": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.6666666666666666, "reason": "F1 is 0.6666666666666666", "named_scores": { "F1": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true, "score": 1, "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/about-claude/models#legacy-models\",\"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison\",\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\"]",
          "prompt": "What are the key differences between the legacy Claude Instant 1.2 model and the Claude 3 Haiku model in terms of capabilities and performance?",
          "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1263,
          "gradingResult": {
            "pass": true, "score": 1, "reason": "All assertions passed", "namedScores": {},
            "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
            "componentResults": [
              { "pass": true, "score": 1, "reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0", "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 } }
              ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true, "score": 1, "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/about-claude/models#legacy-models\",\"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison\",\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\"]",
          "prompt": "What are the key differences between the legacy Claude Instant 1.2 model and the Claude 3 Haiku model in terms of capabilities and performance?",
          "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 1,
          "gradingResult": {
            "pass": true, "score": 1, "reason": "All assertions passed", "namedScores": {},
            "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
            "componentResults": [
              { "pass": true, "score": 1, "reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0", "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 } }
              ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
            ],
            "assertion": null
          },
          "cost": 0
        }
      ],
      "test": {
        "vars": {
          "query": "What are the key differences between the legacy Claude Instant 1.2 model and the Claude 3 Haiku model in terms of capabilities and performance?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison\",\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\",\"https://docs.claude.com/en/docs/about-claude/models#legacy-models\"]"
        },
        "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
        "options": {},
        "description": "Row #23"
      },
      "vars": [
        "[\"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison\",\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\",\"https://docs.claude.com/en/docs/about-claude/models#legacy-models\"]",
        "What are the key differences between the legacy Claude Instant 1.2 model and the Claude 3 Haiku model in terms of capabilities and performance?"
      ]
    },
    {
      "description": "Row #24",
      "outputs": [
        {
          "pass": true, "score": 0.5, "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\"]",
          "prompt": "What is one key benefit of using examples when prompt engineering with Claude?",
          "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1185,
          "gradingResult": {
            "pass": true, "score": 0.5, "reason": "All assertions passed", "namedScores": {},
            "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
            "componentResults": [
              { "pass": true, "score": 0.5, "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.5", "componentResults": [
                { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
              ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true, "score": 0.5, "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow\"]",
          "prompt": "What is one key benefit of using examples when prompt engineering with Claude?",
          "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 5750,
          "gradingResult": {
            "pass": true, "score": 0.5, "reason": "All assertions passed", "namedScores": {},
            "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
            "componentResults": [
              { "pass": true, "score": 0.5, "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0", "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
              ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true, "score": 0.5, "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#crafting-effective-examples\"]",
          "prompt": "What is one key benefit of using examples when prompt engineering with Claude?",
          "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 3331,
          "gradingResult": {
            "pass": true, "score": 0.5, "reason": "All assertions passed", "namedScores": {},
            "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
            "componentResults": [
              { "pass": true, "score": 0.5, "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0", "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
              ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
            ],
            "assertion": null
          },
          "cost": 0
        }
      ],
      "test": {
        "vars": {
          "query": "What is one key benefit of using examples when prompt engineering with Claude?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples\"]"
        },
        "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
        "options": {},
        "description": "Row #24"
      },
      "vars": [
        "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples\"]",
        "What is one key benefit of using examples when prompt engineering with Claude?"
      ]
    },
    {
      "description": "Row #25",
      "outputs": [
        {
          "pass": true, "score": 0.4, "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#before-prompt-engineering\"]",
          "prompt": "According to the Claude Documentation, what is one key advantage of using prompt engineering instead of fine-tuning when it comes to adapting an AI model to new domains or tasks?",
          "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1107,
          "gradingResult": {
            "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
            "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
            "componentResults": [
              { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0", "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true, "score": 0.4, "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak\"]",
          "prompt": "According to the Claude Documentation, what is one key advantage of using prompt engineering instead of fine-tuning when it comes to adapting an AI model to new domains or tasks?",
          "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1083,
          "gradingResult": {
            "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
            "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
            "componentResults": [
              { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0", "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true, "score": 0.4, "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak\"]",
          "prompt": "According to the Claude Documentation, what is one key advantage of using prompt engineering instead of fine-tuning when it comes to adapting an AI model to new domains or tasks?",
          "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 1,
          "gradingResult": {
            "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
            "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
            "componentResults": [
              { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0", "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
            ],
            "assertion": null
          },
          "cost": 0
        }
      ],
      "test": {
        "vars": {
          "query": "According to the Claude Documentation, what is one key advantage of using prompt engineering instead of fine-tuning when it comes to adapting an AI model to new domains or tasks?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\"]"
        },
        "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
        "options": {},
        "description": "Row #25"
      },
      "vars": [
        "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\"]",
        "According to the Claude Documentation, what is one key advantage of using prompt engineering instead of fine-tuning when it comes to adapting an AI model to new domains or tasks?"
      ]
    },
    {
      "description": "Row #26",
      "outputs": [
        {
          "pass": true, "score": 0.8, "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#get-started-with-claude-for-sheets\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#claude-for-sheets-usage-examples\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#claude-for-sheets-workbook-template\"]",
          "prompt": "How can I quickly get started using the Claude for Sheets extension with a pre-made template?",
          "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1334,
          "gradingResult": {
            "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {},
            "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
            "componentResults": [
              { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true, "score": 0.8, "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#claude-for-sheets-workbook-template\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#get-started-with-claude-for-sheets\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#claude-for-sheets-usage-examples\"]",
          "prompt": "How can I quickly get started using the Claude for Sheets extension with a pre-made template?",
          "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1310,
          "gradingResult": {
            "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {},
            "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
            "componentResults": [
              { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true, "score": 0.8, "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#claude-for-sheets-workbook-template\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#get-started-with-claude-for-sheets\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial\"]",
          "prompt": "How can I quickly get started using the Claude for Sheets extension with a pre-made template?",
          "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 3557,
          "gradingResult": {
            "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {},
            "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
            "componentResults": [
              { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
            ],
            "assertion": null
          },
          "cost": 0
        }
      ],
      "test": {
        "vars": {
          "query": "How can I quickly get started using the Claude for Sheets extension with a pre-made template?",
          "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#claude-for-sheets-workbook-template\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#get-started-with-claude-for-sheets\"]"
        },
        "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
        "options": {},
        "description": "Row #26"
      },
      "vars": [
        "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#claude-for-sheets-workbook-template\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#get-started-with-claude-for-sheets\"]",
        "How can I quickly get started using the Claude for Sheets extension with a pre-made template?"
      ]
    },
    {
      "description": "Row #27",
      "outputs": [
        {
          "pass": true, "score": 0.4, "namedScores": {},
          "text": "[\"https://docs.claude.com/en/api/messages-streaming#delta-types\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#event-types\"]",
          "prompt": "How does the \"index\" field in the \"content_block_delta\" event relate to the text being streamed in a response?",
          "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1252,
          "gradingResult": {
            "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
            "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
            "componentResults": [
              { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5", "componentResults": [
                { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true, "score": 0.4, "namedScores": {},
          "text": "[\"https://docs.claude.com/en/api/messages-streaming#delta-types\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#event-types\"]",
          "prompt": "How does the \"index\" field in the \"content_block_delta\" event relate to the text being streamed in a response?",
          "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1264,
          "gradingResult": {
            "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
            "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
            "componentResults": [
              { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5", "componentResults": [
                { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true, "score": 0.4, "namedScores": {},
          "text": "[\"https://docs.claude.com/en/api/messages-streaming#delta-types\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#event-types\"]",
          "prompt": "How does the \"index\" field in the \"content_block_delta\" event relate to the text being streamed in a response?",
          "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 1,
          "gradingResult": {
            "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
            "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
            "componentResults": [
              { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5", "componentResults": [
                { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
            ],
            "assertion": null
          },
          "cost": 0
        }
      ],
      "test": {
        "vars": {
          "query": "How does the \"index\" field in the \"content_block_delta\" event relate to the text being streamed in a response?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#basic-streaming-request\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]"
        },
        "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
        "options": {},
        "description": "Row #27"
      },
      "vars": [
        "[\"https://docs.claude.com/en/api/messages-streaming#basic-streaming-request\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]",
        "How does the \"index\" field in the \"content_block_delta\" event relate to the text being streamed in a response?"
      ]
    },
    {
      "description": "Row #28",
      "outputs": [
        {
          "pass": false, "score": 0, "namedScores": {},
          "text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/build-with-claude/vision#ensuring-image-quality\",\"https://docs.claude.com/en/docs/build-with-claude/vision#faq\",\"https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size\"]",
          "prompt": "How can you include an image as part of a Claude API request, and what image formats are currently supported?",
          "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1472,
          "gradingResult": {
            "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "namedScores": {},
            "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
            "componentResults": [
              { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "componentResults": [
                { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
                { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
                { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
                { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
              ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true, "score": 0.4, "namedScores": {},
          "text": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#ensuring-image-quality\",\"https://docs.claude.com/en/docs/build-with-claude/vision#faq\",\"https://docs.claude.com/en/api/messages-examples#vision\"]",
          "prompt": "How can you include an image as part of a Claude API request, and what image formats are currently supported?",
          "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1595,
          "gradingResult": {
            "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
            "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
            "componentResults": [
              { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333", "componentResults": [
                { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
            ],
            "assertion": null
          },
          "cost": 0
        },
        {
          "pass": true, "score": 0.4, "namedScores": {},
          "text": "[\"https://docs.claude.com/en/api/messages-examples#vision\",\"https://docs.claude.com/en/docs/build-with-claude/vision#faq\",\"https://docs.claude.com/en/docs/build-with-claude/vision#how-to-use-vision\"]",
          "prompt": "How can you include an image as part of a Claude API request, and what image formats are currently supported?",
          "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 3095,
          "gradingResult": {
            "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
            "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
            "componentResults": [
              { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0", "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
              { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
            ],
            "assertion": null
          },
          "cost": 0
        }
      ],
      "test": {
        "vars": {
          "query": "How can you include an image as part of a Claude API request, and what image formats are currently supported?",
          "correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#vision\",\"https://docs.claude.com/en/docs/build-with-claude/vision#about-the-prompt-examples\"]"
        },
        "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
        "options": {},
        "description": "Row #28"
      },
      "vars": [
        "[\"https://docs.claude.com/en/api/messages-examples#vision\",\"https://docs.claude.com/en/docs/build-with-claude/vision#about-the-prompt-examples\"]",
        "How can you include an image as part of a Claude API request, and what image formats are currently supported?"
      ]
    },
  {
    "description": "Row #29",
    "outputs": [
      {
        "pass": true, "score": 1, "namedScores": {},
        "text": "[\"https://docs.claude.com/en/docs/resources/glossary#ttft-time-to-first-token\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-measure-latency\",\"https://docs.claude.com/en/docs/resources/glossary#latency\"]",
        "prompt": "What is the relationship between time to first token (TTFT) and latency when evaluating a language model's performance?",
        "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1445,
        "gradingResult": {
          "pass": true, "score": 1, "reason": "All assertions passed", "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true, "score": 1, "reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        },
        "cost": 0
      },
      {
        "pass": true, "score": 1, "namedScores": {},
        "text": "[\"https://docs.claude.com/en/docs/resources/glossary#ttft-time-to-first-token\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-measure-latency\",\"https://docs.claude.com/en/docs/resources/glossary#latency\"]",
        "prompt": "What is the relationship between time to first token (TTFT) and latency when evaluating a language model's performance?",
        "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1394,
        "gradingResult": {
          "pass": true, "score": 1, "reason": "All assertions passed", "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true, "score": 1, "reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        },
        "cost": 0
      },
      {
        "pass": true, "score": 1, "namedScores": {},
        "text": "[\"https://docs.claude.com/en/docs/resources/glossary#ttft-time-to-first-token\",\"https://docs.claude.com/en/docs/resources/glossary#latency\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-measure-latency\"]",
        "prompt": "What is the relationship between time to first token (TTFT) and latency when evaluating a language model's performance?",
        "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 3244,
        "gradingResult": {
          "pass": true, "score": 1, "reason": "All assertions passed", "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true, "score": 1, "reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        },
        "cost": 0
      }
    ],
    "test": {
      "vars": {
        "query": "What is the relationship between time to first token (TTFT) and latency when evaluating a language model's performance?",
        "correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#ttft-time-to-first-token\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-measure-latency\",\"https://docs.claude.com/en/docs/resources/glossary#latency\"]"
      },
      "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
      "options": {},
      "description": "Row #29"
    },
    "vars": [
      "[\"https://docs.claude.com/en/docs/resources/glossary#ttft-time-to-first-token\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-measure-latency\",\"https://docs.claude.com/en/docs/resources/glossary#latency\"]",
      "What is the relationship between time to first token (TTFT) and latency when evaluating a language model's performance?"
    ]
  },
  {
    "description": "Row #30",
    "outputs": [
      {
        "pass": true, "score": 0.4, "namedScores": {},
        "text": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#adapting-to-common-scenarios\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#introduction\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations\"]",
        "prompt": "How can providing Claude with examples of handling certain edge cases like implicit requests or emotional prioritization help improve its performance in routing support tickets?",
        "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1397,
        "gradingResult": {
          "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        },
        "cost": 0
      },
      {
        "pass": true, "score": 0.4, "namedScores": {},
        "text": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#adapting-to-common-scenarios\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#introduction\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations\"]",
        "prompt": "How can providing Claude with examples of handling certain edge cases like implicit requests or emotional prioritization help improve its performance in routing support tickets?",
        "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1399,
        "gradingResult": {
          "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        },
        "cost": 0
      },
      {
        "pass": true, "score": 0.4, "namedScores": {},
        "text": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#adapting-to-common-scenarios\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#advantages-of-using-claude\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology\"]",
        "prompt": "How can providing Claude with examples of handling certain edge cases like implicit requests or emotional prioritization help improve its performance in routing support tickets?",
        "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 4102,
        "gradingResult": {
          "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        },
        "cost": 0
      }
    ],
    "test": {
      "vars": {
        "query": "How can providing Claude with examples of handling certain edge cases like implicit requests or emotional prioritization help improve its performance in routing support tickets?",
        "correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#adapting-to-common-scenarios\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#prompting-claude-for-ticket-routing\"]"
      },
      "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
      "options": {},
      "description": "Row #30"
    },
    "vars": [
      "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#adapting-to-common-scenarios\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#prompting-claude-for-ticket-routing\"]",
      "How can providing Claude with examples of handling certain edge cases like implicit requests or emotional prioritization help improve its performance in routing support tickets?"
    ]
  },
  {
    "description": "Row #31",
    "outputs": [
      {
        "pass": true, "score": 0.4, "namedScores": {},
        "text": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#next-steps\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#chain-of-thought\"]",
        "prompt": "How does the stop_reason of \"tool_use\" relate to the overall workflow of integrating external tools with Claude?",
        "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1507,
        "gradingResult": {
          "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        },
        "cost": 0
      },
      {
        "pass": true, "score": 0.4, "namedScores": {},
        "text": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#pricing\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#next-steps\"]",
        "prompt": "How does the stop_reason of \"tool_use\" relate to the overall workflow of integrating external tools with Claude?",
        "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1468,
        "gradingResult": {
          "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        },
        "cost": 0
      },
      {
        "pass": true, "score": 0.4, "namedScores": {},
        "text": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#chain-of-thought\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#forcing-tool-use\"]",
        "prompt": "How does the stop_reason of \"tool_use\" relate to the overall workflow of integrating external tools with Claude?",
        "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 3300,
        "gradingResult": {
          "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
                { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
                { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        },
        "cost": 0
      }
    ],
    "test": {
      "vars": {
        "query": "How does the stop_reason of \"tool_use\" relate to the overall workflow of integrating external tools with Claude?",
        "correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#tool-use-and-json-mode\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\"]"
      },
      "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
      "options": {},
      "description": "Row #31"
    },
    "vars": [
      "[\"https://docs.claude.com/en/api/messages-examples#tool-use-and-json-mode\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\"]",
      "How does the stop_reason of \"tool_use\" relate to the overall workflow of integrating external tools with Claude?"
    ]
  },
  {
    "description": "Row #32",
    "outputs": [
      {
        "pass": true, "score": 1, "namedScores": {},
        "text": "[\"https://docs.claude.com/en/api/errors#http-errors\",\"https://docs.claude.com/en/api/messages-streaming#error-events\",\"https://docs.claude.com/en/api/streaming#error-event-types\"]",
        "prompt": "According to the documentation, what error event and corresponding HTTP error code may be sent during periods of high usage for the Claude API when using streaming responses?",
        "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1216,
        "gradingResult": {
          "pass": true, "score": 1, "reason": "All assertions passed", "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true, "score": 1, "reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        },
        "cost": 0
      },
      {
        "pass": true, "score": 1, "namedScores": {},
        "text": "[\"https://docs.claude.com/en/api/errors#http-errors\",\"https://docs.claude.com/en/api/messages-streaming#error-events\",\"https://docs.claude.com/en/api/streaming#error-event-types\"]",
        "prompt": "According to the documentation, what error event and corresponding HTTP error code may be sent during periods of high usage for the Claude API when using streaming responses?",
        "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1217,
        "gradingResult": {
          "pass": true, "score": 1, "reason": "All assertions passed", "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true, "score": 1, "reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        },
        "cost": 0
      },
      {
        "pass": true, "score": 1, "namedScores": {},
        "text": "[\"https://docs.claude.com/en/api/errors#http-errors\",\"https://docs.claude.com/en/api/messages-streaming#error-events\",\"https://docs.claude.com/en/api/streaming#error-event-types\"]",
        "prompt": "According to the documentation, what error event and corresponding HTTP error code may be sent during periods of high usage for the Claude API when using streaming responses?",
        "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 1,
        "gradingResult": {
          "pass": true, "score": 1, "reason": "All assertions passed", "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true, "score": 1, "reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        },
        "cost": 0
      }
    ],
    "test": {
      "vars": {
        "query": "According to the documentation, what error event and corresponding HTTP error code may be sent during periods of high usage for the Claude API when using streaming responses?",
        "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#error-events\",\"https://docs.claude.com/en/api/streaming#error-event-types\",\"https://docs.claude.com/en/api/errors#http-errors\"]"
      },
      "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
      "options": {},
      "description": "Row #32"
    },
    "vars": [
      "[\"https://docs.claude.com/en/api/messages-streaming#error-events\",\"https://docs.claude.com/en/api/streaming#error-event-types\",\"https://docs.claude.com/en/api/errors#http-errors\"]",
      "According to the documentation, what error event and corresponding HTTP error code may be sent during periods of high usage for the Claude API when using streaming responses?"
    ]
  },
  {
    "description": "Row #33",
    "outputs": [
      {
        "pass": true, "score": 0.8, "namedScores": {},
        "text": "[\"https://docs.claude.com/en/api/messages-streaming#delta-types\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\"]",
        "prompt": "What are the two types of deltas that can be contained in a content_block_delta event when streaming responses from the Claude API?",
        "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1406,
        "gradingResult": {
          "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        },
        "cost": 0
      },
      {
        "pass": true, "score": 0.8, "namedScores": {},
        "text": "[\"https://docs.claude.com/en/api/messages-streaming#delta-types\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\"]",
        "prompt": "What are the two types of deltas that can be contained in a content_block_delta event when streaming responses from the Claude API?",
        "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1881,
        "gradingResult": {
          "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        },
        "cost": 0
      },
      {
        "pass": true, "score": 0.8, "namedScores": {},
        "text": "[\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#delta-types\"]",
        "prompt": "What are the two types of deltas that can be contained in a content_block_delta event when streaming responses from the Claude API?",
        "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 8176,
        "gradingResult": {
          "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        },
        "cost": 0
      }
    ],
    "test": {
      "vars": {
        "query": "What are the two types of deltas that can be contained in a content_block_delta event when streaming responses from the Claude API?",
        "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#delta-types\"]"
      },
      "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
      "options": {},
      "description": "Row #33"
    },
    "vars": [
      "[\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#delta-types\"]",
      "What are the two types of deltas that can be contained in a content_block_delta event when streaming responses from the Claude API?"
    ]
  },
  {
    "description": "Row #34",
    "outputs": [
      {
        "pass": false, "score": 0, "namedScores": {},
"text": "Error: Error running Python script: _pickle.UnpicklingError: pickle data was truncated\nStack Trace: Error: _pickle.UnpicklingError: pickle data was truncated\n at PythonShell.parseError (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/python-shell/index.js:303:21)\n at terminateIfNeeded (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/python-shell/index.js:193:32)\n at ChildProcess.<anonymous> (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/python-shell/index.js:185:13)\n at ChildProcess.emit (node:events:519:28)\n at ChildProcess._handle.onexit (node:internal/child_process:294:12)\n --Python Traceback: --\n File \"/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/python/wrapper.py\", line 34, in <module>\n result = call_method(script_path, method_name, *data)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/python/wrapper.py\", line 18, in call_method\n spec.loader.exec_module(script_module)\n File \"<frozen importlib._bootstrap_external>\", line 940, in exec_module\n File \"<frozen importlib._bootstrap>\", line 241, in _call_with_frames_removed\n File \"/Users/sflamini/code/anthropic-cookbook/skills/retrieval_augmented_generation/evaluation/provider_retrieval.py\", line 114, in <module>\n db_rerank.load_data(anthropic_docs_summaries)\n File \"/Users/sflamini/code/anthropic-cookbook/skills/retrieval_augmented_generation/evaluation/vectordb.py\", line 108, in load_data\n self.load_db()\n File \"/Users/sflamini/code/anthropic-cookbook/skills/retrieval_augmented_generation/evaluation/vectordb.py\", line 169, in load_db\n data = pickle.load(file)\n ^^^^^^^^^^^^^^^^^\n\nError: Error running Python script: _pickle.UnpicklingError: pickle data was truncated\nStack Trace: Error: _pickle.UnpicklingError: pickle data was truncated\n at PythonShell.parseError (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/python-shell/index.js:303:21)\n at terminateIfNeeded (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/python-shell/index.js:193:32)\n at ChildProcess.<anonymous> (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/python-shell/index.js:185:13)\n at ChildProcess.emit (node:events:519:28)\n at ChildProcess._handle.onexit (node:internal/child_process:294:12)\n --Python Traceback: --\n File \"/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/python/wrapper.py\", line 34, in <module>\n result = call_method(script_path, method_name, *data)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/python/wrapper.py\", line 18, in call_method\n spec.loader.exec_module(script_module)\n File \"<frozen importlib._bootstrap_external>\", line 940, in exec_module\n File \"<frozen importlib._bootstrap>\", line 241, in _call_with_frames_removed\n File \"/Users/sflamini/code/anthropic-cookbook/skills/retrieval_augmented_generation/evaluation/provider_retrieval.py\", line 114, in <module>\n db_rerank.load_data(anthropic_docs_summaries)\n File \"/Users/sflamini/code/anthropic-cookbook/skills/retrieval_augmented_generation/evaluation/vectordb.py\", line 108, in load_data\n self.load_db()\n File \"/Users/sflamini/code/anthropic-cookbook/skills/retrieval_augmented_generation/evaluation/vectordb.py\", line 169, in load_db\n data = pickle.load(file)\n ^^^^^^^^^^^^^^^^^\n at runPython 
(/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/python/wrapper.js:50:15)\n at process.processTicksAndRejections (node:internal/process/task_queues:95:5)\n at async PythonProvider.executePythonScript (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/providers/pythonCompletion.js:52:31)\n at async Evaluator.runEval (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/evaluator.js:297:28)\n at async processEvalStep (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/evaluator.js:619:25)\n---\nError: Error running Python script: _pickle.UnpicklingError: pickle data was truncated\nStack Trace: Error: _pickle.UnpicklingError: pickle data was truncated\n at PythonShell.parseError (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/python-shell/index.js:303:21)\n at terminateIfNeeded (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/python-shell/index.js:193:32)\n at ChildProcess.<anonymous> (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/python-shell/index.js:185:13)\n at ChildProcess.emit (node:events:519:28)\n at ChildProcess._handle.onexit (node:internal/child_process:294:12)\n --Python Traceback: --\n File \"/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/python/wrapper.py\", line 34, in <module>\n result = call_method(script_path, method_name, *data)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/python/wrapper.py\", line 18, in call_method\n spec.loader.exec_module(script_module)\n File \"<frozen importlib._bootstrap_external>\", line 940, in exec_module\n File \"<frozen importlib._bootstrap>\", line 241, in _call_with_frames_removed\n File \"/Users/sflamini/code/anthropic-cookbook/skills/retrieval_augmented_generation/evaluation/provider_retrieval.py\", line 114, in <module>\n db_rerank.load_data(anthropic_docs_summaries)\n File \"/Users/sflamini/code/anthropic-cookbook/skills/retrieval_augmented_generation/evaluation/vectordb.py\", line 108, in load_data\n self.load_db()\n File \"/Users/sflamini/code/anthropic-cookbook/skills/retrieval_augmented_generation/evaluation/vectordb.py\", line 169, in load_db\n data = pickle.load(file)\n ^^^^^^^^^^^^^^^^^\n\nError: Error running Python script: _pickle.UnpicklingError: pickle data was truncated\nStack Trace: Error: _pickle.UnpicklingError: pickle data was truncated\n at PythonShell.parseError (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/python-shell/index.js:303:21)\n at terminateIfNeeded (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/python-shell/index.js:193:32)\n at ChildProcess.<anonymous> (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/python-shell/index.js:185:13)\n at ChildProcess.emit (node:events:519:28)\n at ChildProcess._handle.onexit (node:internal/child_process:294:12)\n --Python Traceback: --\n File \"/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/python/wrapper.py\", line 34, in <module>\n result = call_method(script_path, method_name, *data)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/python/wrapper.py\", line 18, in call_method\n spec.loader.exec_module(script_module)\n File \"<frozen importlib._bootstrap_external>\", line 940, in exec_module\n File \"<frozen importlib._bootstrap>\", line 241, in _call_with_frames_removed\n File 
\"/Users/sflamini/code/anthropic-cookbook/skills/retrieval_augmented_generation/evaluation/provider_retrieval.py\", line 114, in <module>\n db_rerank.load_data(anthropic_docs_summaries)\n File \"/Users/sflamini/code/anthropic-cookbook/skills/retrieval_augmented_generation/evaluation/vectordb.py\", line 108, in load_data\n self.load_db()\n File \"/Users/sflamini/code/anthropic-cookbook/skills/retrieval_augmented_generation/evaluation/vectordb.py\", line 169, in load_db\n data = pickle.load(file)\n ^^^^^^^^^^^^^^^^^\n at runPython (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/python/wrapper.js:50:15)\n at process.processTicksAndRejections (node:internal/process/task_queues:95:5)\n at async PythonProvider.executePythonScript (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/providers/pythonCompletion.js:52:31)\n at async Evaluator.runEval (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/evaluator.js:297:28)\n at async processEvalStep (/Users/sflamini/.npm/_npx/81bbc6515d992ace/node_modules/promptfoo/dist/src/evaluator.js:619:25)",
        "prompt": "On what date did Claude 3.5 Sonnet and tool use both become generally available across the Claude API, Amazon Bedrock, and Google Vertex AI?",
        "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 0,
        "cost": 0
      },
      {
        "pass": true, "score": 0.8, "namedScores": {},
        "text": "[\"https://docs.claude.com/en/release-notes/api#june-20th-2024\",\"https://docs.claude.com/en/release-notes/api#may-30th-2024\",\"https://docs.claude.com/en/docs/intro-to-claude#claude-3-5-family\"]",
        "prompt": "On what date did Claude 3.5 Sonnet and tool use both become generally available across the Claude API, Amazon Bedrock, and Google Vertex AI?",
        "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1483,
        "gradingResult": {
          "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        },
        "cost": 0
      },
      {
        "pass": true, "score": 0.8, "namedScores": {},
        "text": "[\"https://docs.claude.com/en/release-notes/api#june-20th-2024\",\"https://docs.claude.com/en/release-notes/api#may-30th-2024\",\"https://docs.claude.com/en/docs/about-claude/models#model-names\"]",
        "prompt": "On what date did Claude 3.5 Sonnet and tool use both become generally available across the Claude API, Amazon Bedrock, and Google Vertex AI?",
        "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 2978,
        "gradingResult": {
          "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {},
          "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
          "componentResults": [
            {
              "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
              "componentResults": [
                { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
                { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
                { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
                { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
              ],
              "assertion": { "type": "python", "value": "file://eval_retrieval.py" }
            },
            { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
            { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
          ],
          "assertion": null
        },
        "cost": 0
      }
    ],
    "test": {
      "vars": {
        "query": "On what date did Claude 3.5 Sonnet and tool use both become generally available across the Claude API, Amazon Bedrock, and Google Vertex AI?",
        "correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#june-20th-2024\",\"https://docs.claude.com/en/release-notes/api#may-30th-2024\"]"
      },
      "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
      "options": {},
      "description": "Row #34"
    },
    "vars": [
      "[\"https://docs.claude.com/en/release-notes/api#june-20th-2024\",\"https://docs.claude.com/en/release-notes/api#may-30th-2024\"]",
      "On what date did Claude 3.5 Sonnet and tool use both become generally available across the Claude API, Amazon Bedrock, and Google Vertex AI?"
    ]
  },
{
  "description": "Row #35",
  "outputs": [
    {
      "pass": true, "score": 0.8, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024\",\"https://docs.claude.com/en/docs/intro-to-claude#start-building-with-claude\"]",
      "prompt": "In what order did Anthropic launch Claude.ai and the Claude iOS app in Canada and Europe?",
      "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1169,
      "gradingResult": {
        "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [ { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } } ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true, "score": 0.8, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024\",\"https://docs.claude.com/en/api/claude-on-vertex-ai#model-availability\"]",
      "prompt": "In what order did Anthropic launch Claude.ai and the Claude iOS app in Canada and Europe?",
      "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1252,
      "gradingResult": {
        "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [ { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } } ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true, "score": 0.8, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024\",\"https://docs.claude.com/en/docs/intro-to-claude#start-building-with-claude\"]",
      "prompt": "In what order did Anthropic launch Claude.ai and the Claude iOS app in Canada and Europe?",
      "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 14,
      "gradingResult": {
        "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [ { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } } ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    }
  ],
  "test": {
    "vars": {
      "query": "In what order did Anthropic launch Claude.ai and the Claude iOS app in Canada and Europe?",
      "correct_chunks": "[\"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024\"]"
    },
    "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
    "options": {},
    "description": "Row #35"
  },
  "vars": [
    "[\"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024\"]",
    "In what order did Anthropic launch Claude.ai and the Claude iOS app in Canada and Europe?"
  ]
},
{
  "description": "Row #36",
  "outputs": [
    {
      "pass": true, "score": 0.4, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#forcing-tool-use\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors\"]",
      "prompt": "When the API response from Claude has a stop_reason of \"tool_use\", what does this indicate and what should be done next to continue the conversation?",
      "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 925,
      "gradingResult": {
        "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0", "componentResults": [ { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } }, { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } }, { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } } ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true, "score": 0.4, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#forcing-tool-use\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors\"]",
      "prompt": "When the API response from Claude has a stop_reason of \"tool_use\", what does this indicate and what should be done next to continue the conversation?",
      "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1459,
      "gradingResult": {
        "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0", "componentResults": [ { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } }, { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } }, { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } } ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true, "score": 0.4, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#forcing-tool-use\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors\"]",
      "prompt": "When the API response from Claude has a stop_reason of \"tool_use\", what does this indicate and what should be done next to continue the conversation?",
      "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 2,
      "gradingResult": {
        "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0", "componentResults": [ { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } }, { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } }, { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } } ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    }
  ],
  "test": {
    "vars": {
      "query": "When the API response from Claude has a stop_reason of \"tool_use\", what does this indicate and what should be done next to continue the conversation?",
      "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#json-output\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\"]"
    },
    "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
    "options": {},
    "description": "Row #36"
  },
  "vars": [
    "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#json-output\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\"]",
    "When the API response from Claude has a stop_reason of \"tool_use\", what does this indicate and what should be done next to continue the conversation?"
  ]
},
{
  "description": "Row #37",
  "outputs": [
    {
      "pass": false, "score": 0, "namedScores": {},
      "text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/api/client-sdks#python\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-python-package\",\"https://docs.claude.com/en/api/prompt-validation#examples\"]",
      "prompt": "What Python libraries are used in the example code snippet for evaluating tone and style in a customer service chatbot?",
      "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1183,
      "gradingResult": {
        "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "componentResults": [ { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } }, { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } }, { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } }, { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } } ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": false, "score": 0, "namedScores": {},
      "text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/api/client-sdks#python\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-python-package\",\"https://docs.claude.com/en/api/prompt-validation#examples\"]",
      "prompt": "What Python libraries are used in the example code snippet for evaluating tone and style in a customer service chatbot?",
      "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1431,
      "gradingResult": {
        "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "componentResults": [ { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } }, { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } }, { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } }, { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } } ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": false, "score": 0, "namedScores": {},
      "text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/api/client-sdks#python\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-python-package\",\"https://docs.claude.com/en/api/prompt-validation#examples\"]",
      "prompt": "What Python libraries are used in the example code snippet for evaluating tone and style in a customer service chatbot?",
      "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 0,
      "gradingResult": {
        "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "componentResults": [ { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } }, { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } }, { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } }, { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } } ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    }
  ],
  "test": {
    "vars": {
      "query": "What Python libraries are used in the example code snippet for evaluating tone and style in a customer service chatbot?",
      "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#example-evals\"]"
    },
    "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
    "options": {},
    "description": "Row #37"
  },
  "vars": [
    "[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#example-evals\"]",
    "What Python libraries are used in the example code snippet for evaluating tone and style in a customer service chatbot?"
  ]
},
{
  "description": "Row #38",
  "outputs": [
    {
      "pass": true, "score": 0.8, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#install-an-sdk-for-accessing-bedrock\",\"https://docs.claude.com/en/api/claude-on-vertex-ai#making-requests\"]",
      "prompt": "What are the two main ways to authenticate when using the Anthropic Python SDK to access Claude models on Amazon Bedrock?",
      "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1543,
      "gradingResult": {
        "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [ { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } } ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true, "score": 0.4, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#subscribe-to-anthropic-models\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#accessing-bedrock\"]",
      "prompt": "What are the two main ways to authenticate when using the Anthropic Python SDK to access Claude models on Amazon Bedrock?",
      "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1577,
      "gradingResult": {
        "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0", "componentResults": [ { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } }, { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } }, { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } } ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true, "score": 0.8, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#install-an-sdk-for-accessing-bedrock\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#api-model-names\"]",
      "prompt": "What are the two main ways to authenticate when using the Anthropic Python SDK to access Claude models on Amazon Bedrock?",
      "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 8341,
      "gradingResult": {
        "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [ { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } } ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    }
  ],
  "test": {
    "vars": {
      "query": "What are the two main ways to authenticate when using the Anthropic Python SDK to access Claude models on Amazon Bedrock?",
      "correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#install-an-sdk-for-accessing-bedrock\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests\"]"
    },
    "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
    "options": {},
    "description": "Row #38"
  },
  "vars": [
    "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#install-an-sdk-for-accessing-bedrock\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests\"]",
    "What are the two main ways to authenticate when using the Anthropic Python SDK to access Claude models on Amazon Bedrock?"
  ]
},
{
  "description": "Row #39",
  "outputs": [
    {
      "pass": true, "score": 1, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#strategies-to-reduce-prompt-leak\"]",
      "prompt": "When deciding whether to implement leak-resistant prompt engineering strategies, what two factors should be considered and balanced?",
      "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1337,
      "gradingResult": {
        "pass": true, "score": 1, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 1, "reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0", "componentResults": [ { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 } } ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true, "score": 1, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#strategies-to-reduce-prompt-leak\"]",
      "prompt": "When deciding whether to implement leak-resistant prompt engineering strategies, what two factors should be considered and balanced?",
      "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1365,
      "gradingResult": {
        "pass": true, "score": 1, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 1, "reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0", "componentResults": [ { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 } } ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true, "score": 1, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#strategies-to-reduce-prompt-leak\"]",
      "prompt": "When deciding whether to implement leak-resistant prompt engineering strategies, what two factors should be considered and balanced?",
      "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 2928,
      "gradingResult": {
        "pass": true, "score": 1, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 1, "reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0", "componentResults": [ { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 } } ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    }
  ],
  "test": {
    "vars": {
      "query": "When deciding whether to implement leak-resistant prompt engineering strategies, what two factors should be considered and balanced?",
      "correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#strategies-to-reduce-prompt-leak\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak\"]"
    },
    "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
    "options": {},
    "description": "Row #39"
  },
  "vars": [
    "[\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#strategies-to-reduce-prompt-leak\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak\"]",
    "When deciding whether to implement leak-resistant prompt engineering strategies, what two factors should be considered and balanced?"
  ]
},
{
  "description": "Row #40",
  "outputs": [
    {
      "pass": true, "score": 0.8, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification\",\"https://docs.claude.com/en/docs/intro-to-claude#model-options\"]",
      "prompt": "How can selecting the appropriate Claude model based on your specific requirements help reduce latency in your application?",
      "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1379,
      "gradingResult": {
        "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [ { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } } ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true, "score": 0.8, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\",\"https://docs.claude.com/en/docs/intro-to-claude#model-options\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-reduce-latency\"]",
      "prompt": "How can selecting the appropriate Claude model based on your specific requirements help reduce latency in your application?",
      "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1372,
      "gradingResult": {
        "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [ { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } } ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true, "score": 0.8, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\",\"https://docs.claude.com/en/docs/intro-to-claude#model-options\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-reduce-latency\"]",
      "prompt": "How can selecting the appropriate Claude model based on your specific requirements help reduce latency in your application?",
      "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 0,
      "gradingResult": {
        "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [ { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } } ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    }
  ],
  "test": {
    "vars": {
      "query": "How can selecting the appropriate Claude model based on your specific requirements help reduce latency in your application?",
      "correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\",\"https://docs.claude.com/en/docs/intro-to-claude#model-options\"]"
    },
    "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
    "options": {},
    "description": "Row #40"
  },
  "vars": [
    "[\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\",\"https://docs.claude.com/en/docs/intro-to-claude#model-options\"]",
    "How can selecting the appropriate Claude model based on your specific requirements help reduce latency in your application?"
  ]
},
{
  "description": "Row #41",
  "outputs": [
    {
      "pass": true, "score": 0.4, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/api/messages-streaming#streaming-with-sdks\",\"https://docs.claude.com/en/docs/quickstart#prerequisites\",\"https://docs.claude.com/en/api/messages-streaming#basic-streaming-request\"]",
      "prompt": "How can you stream responses from the Claude API using the Python SDK?",
      "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1396,
      "gradingResult": {
        "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0", "componentResults": [ { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } }, { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } }, { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } } ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true, "score": 0.4, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/api/messages-streaming#streaming-with-sdks\",\"https://docs.claude.com/en/api/messages-streaming#basic-streaming-request\",\"https://docs.claude.com/en/api/#authentication\"]",
      "prompt": "How can you stream responses from the Claude API using the Python SDK?",
      "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1297,
      "gradingResult": {
        "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0", "componentResults": [ { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } }, { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } }, { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } } ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    },
    {
      "pass": true, "score": 0.8, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/api/messages-streaming#streaming-with-sdks\",\"https://docs.claude.com/en/api/client-sdks#python\",\"https://docs.claude.com/en/api/messages-streaming#basic-streaming-request\"]",
      "prompt": "How can you stream responses from the Claude API using the Python SDK?",
      "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 3824,
      "gradingResult": {
        "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [ { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } }, { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } }, { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } }, { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } } ], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null
      },
      "cost": 0
    }
  ],
  "test": {
    "vars": {
      "query": "How can you stream responses from the Claude API using the Python SDK?",
      "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#streaming-with-sdks\",\"https://docs.claude.com/en/api/client-sdks#python\"]"
    },
    "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
    "options": {},
    "description": "Row #41"
  },
  "vars": [
    "[\"https://docs.claude.com/en/api/messages-streaming#streaming-with-sdks\",\"https://docs.claude.com/en/api/client-sdks#python\"]",
    "How can you stream responses from the Claude API using the Python SDK?"
  ]
},
{
|
|
"description": "Row #42",
|
|
"outputs": [
|
|
{
|
|
"pass": false,
|
|
"score": 0,
|
|
"namedScores": {},
|
|
"text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#how-to-prefill-claudes-response\",\"https://docs.claude.com/en/docs/welcome#get-started\",\"https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources\"]",
|
|
"prompt": "How can you guide Claude's response by pre-filling part of the response, and what API parameter is used to generate a short response in this case?",
|
|
"provider": "python:provider_retrieval.py:retrieve_base",
|
|
"latencyMs": 1265,
|
|
"gradingResult": {
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#how-to-prefill-claudes-response\",\"https://docs.claude.com/en/docs/welcome#get-started\",\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\"]",
|
|
"prompt": "How can you guide Claude's response by pre-filling part of the response, and what API parameter is used to generate a short response in this case?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_two",
|
|
"latencyMs": 1193,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "MRR is 0.3333333333333333",
|
|
"named_scores": {
|
|
"MRR": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "MRR is 0.3333333333333333",
|
|
"named_scores": {
|
|
"MRR": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": false,
|
|
"score": 0,
|
|
"namedScores": {},
|
|
"text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#how-to-prefill-claudes-response\",\"https://docs.claude.com/en/docs/welcome#get-started\",\"https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources\"]",
|
|
"prompt": "How can you guide Claude's response by pre-filling part of the response, and what API parameter is used to generate a short response in this case?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_three",
|
|
"latencyMs": 1,
|
|
"gradingResult": {
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
}
|
|
],
|
|
"test": {
|
|
"vars": {
|
|
"query": "How can you guide Claude's response by pre-filling part of the response, and what API parameter is used to generate a short response in this case?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\",\"https://docs.claude.com/en/api/messages-examples#basic-request-and-response\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #42"
|
|
},
|
|
"vars": [
|
|
"[\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\",\"https://docs.claude.com/en/api/messages-examples#basic-request-and-response\"]",
|
|
"How can you guide Claude's response by pre-filling part of the response, and what API parameter is used to generate a short response in this case?"
|
|
]
|
|
},
|
|
{
|
|
"description": "Row #43",
|
|
"outputs": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#eval-design-principles\",\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#grading-evals\",\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\"]",
|
|
"prompt": "What is more important when building an eval set for an AI system - having a larger number of test cases with automated grading, or having fewer high-quality test cases graded by humans?",
|
|
"provider": "python:provider_retrieval.py:retrieve_base",
|
|
"latencyMs": 1322,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#eval-design-principles\",\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#grading-evals\",\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\"]",
|
|
"prompt": "What is more important when building an eval set for an AI system - having a larger number of test cases with automated grading, or having fewer high-quality test cases graded by humans?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_two",
|
|
"latencyMs": 1280,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#eval-design-principles\",\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#grading-evals\",\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\"]",
|
|
"prompt": "What is more important when building an eval set for an AI system - having a larger number of test cases with automated grading, or having fewer high-quality test cases graded by humans?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_three",
|
|
"latencyMs": 1,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
}
|
|
],
|
|
"test": {
|
|
"vars": {
|
|
"query": "What is more important when building an eval set for an AI system - having a larger number of test cases with automated grading, or having fewer high-quality test cases graded by humans?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#eval-design-principles\",\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#building-evals-and-test-cases\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #43"
|
|
},
|
|
"vars": [
|
|
"[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#eval-design-principles\",\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#building-evals-and-test-cases\"]",
|
|
"What is more important when building an eval set for an AI system - having a larger number of test cases with automated grading, or having fewer high-quality test cases graded by humans?"
|
|
]
|
|
},
|
|
{
|
|
"description": "Row #44",
|
|
"outputs": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/api/messages-streaming#delta-types\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\"]",
|
|
"prompt": "What are the two required fields in a content_block_delta event for a text delta type?",
|
|
"provider": "python:provider_retrieval.py:retrieve_base",
|
|
"latencyMs": 1478,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#delta-types\",\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\"]",
|
|
"prompt": "What are the two required fields in a content_block_delta event for a text delta type?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_two",
|
|
"latencyMs": 1507,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#delta-types\",\"https://docs.claude.com/en/api/messages-streaming#event-types\"]",
|
|
"prompt": "What are the two required fields in a content_block_delta event for a text delta type?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_three",
|
|
"latencyMs": 2091,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
}
|
|
],
|
|
"test": {
|
|
"vars": {
|
|
"query": "What are the two required fields in a content_block_delta event for a text delta type?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#delta-types\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #44"
|
|
},
|
|
"vars": [
|
|
"[\"https://docs.claude.com/en/api/messages-streaming#delta-types\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]",
|
|
"What are the two required fields in a content_block_delta event for a text delta type?"
|
|
]
|
|
},
|
|
{
|
|
"description": "Row #45",
|
|
"outputs": [
|
|
{
|
|
"pass": false,
|
|
"score": 0,
|
|
"namedScores": {},
|
|
"text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook\",\"https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources\",\"https://docs.claude.com/en/docs/about-claude/models#get-started-with-claude\"]",
|
|
"prompt": "What are two interactive ways to learn how to use Claude's capabilities, such as uploading PDFs and generating embeddings?",
|
|
"provider": "python:provider_retrieval.py:retrieve_base",
|
|
"latencyMs": 1487,
|
|
"gradingResult": {
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": false,
|
|
"score": 0,
|
|
"namedScores": {},
|
|
"text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#claude-for-sheets-usage-examples\",\"https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#further-information\"]",
|
|
"prompt": "What are two interactive ways to learn how to use Claude's capabilities, such as uploading PDFs and generating embeddings?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_two",
|
|
"latencyMs": 1628,
|
|
"gradingResult": {
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook\",\"https://docs.claude.com/en/docs/welcome#develop-with-claude\",\"https://docs.claude.com/en/docs/welcome#get-started\"]",
|
|
"prompt": "What are two interactive ways to learn how to use Claude's capabilities, such as uploading PDFs and generating embeddings?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_three",
|
|
"latencyMs": 3638,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "MRR is 0.5",
|
|
"named_scores": {
|
|
"MRR": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "MRR is 0.5",
|
|
"named_scores": {
|
|
"MRR": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
}
|
|
],
|
|
"test": {
|
|
"vars": {
|
|
"query": "What are two interactive ways to learn how to use Claude's capabilities, such as uploading PDFs and generating embeddings?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/quickstart#next-steps\",\"https://docs.claude.com/en/docs/welcome#develop-with-claude\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #45"
|
|
},
|
|
"vars": [
|
|
"[\"https://docs.claude.com/en/docs/quickstart#next-steps\",\"https://docs.claude.com/en/docs/welcome#develop-with-claude\"]",
|
|
"What are two interactive ways to learn how to use Claude's capabilities, such as uploading PDFs and generating embeddings?"
|
|
]
|
|
},
|
|
{
|
|
"description": "Row #46",
|
|
"outputs": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#why-let-claude-think\"]",
|
|
"prompt": "Why does breaking a task into distinct subtasks for chained prompts help improve Claude's accuracy on the overall task?",
|
|
"provider": "python:provider_retrieval.py:retrieve_base",
|
|
"latencyMs": 1533,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#why-let-claude-think\"]",
|
|
"prompt": "Why does breaking a task into distinct subtasks for chained prompts help improve Claude's accuracy on the overall task?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_two",
|
|
"latencyMs": 1486,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#chain-prompts-for-complex-tasks\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#when-to-chain-prompts\"]",
|
|
"prompt": "Why does breaking a task into distinct subtasks for chained prompts help improve Claude's accuracy on the overall task?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_three",
|
|
"latencyMs": 3546,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
}
|
|
],
|
|
"test": {
|
|
"vars": {
|
|
"query": "Why does breaking a task into distinct subtasks for chained prompts help improve Claude's accuracy on the overall task?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #46"
|
|
},
|
|
"vars": [
|
|
"[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts\"]",
|
|
"Why does breaking a task into distinct subtasks for chained prompts help improve Claude's accuracy on the overall task?"
|
|
]
|
|
},
|
|
{
|
|
"description": "Row #47",
|
|
"outputs": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#3-leverage-streaming\"]",
|
|
"prompt": "How does the streaming format for Messages responses differ from Text Completions streaming responses?",
|
|
"provider": "python:provider_retrieval.py:retrieve_base",
|
|
"latencyMs": 1353,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#3-leverage-streaming\"]",
|
|
"prompt": "How does the streaming format for Messages responses differ from Text Completions streaming responses?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_two",
|
|
"latencyMs": 1248,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#3-leverage-streaming\"]",
|
|
"prompt": "How does the streaming format for Messages responses differ from Text Completions streaming responses?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_three",
|
|
"latencyMs": 0,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
}
|
|
],
|
|
"test": {
|
|
"vars": {
|
|
"query": "How does the streaming format for Messages responses differ from Text Completions streaming responses?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #47"
|
|
},
|
|
"vars": [
|
|
"[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format\"]",
|
|
"How does the streaming format for Messages responses differ from Text Completions streaming responses?"
|
|
]
|
|
},
|
|
{
|
|
"description": "Row #48",
|
|
"outputs": [
|
|
{
|
|
"pass": false,
|
|
"score": 0,
|
|
"namedScores": {},
|
|
"text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/intro-to-claude#start-building-with-claude\",\"https://docs.claude.com/en/docs/welcome#develop-with-claude\",\"https://docs.claude.com/en/docs/welcome#get-started\"]",
|
|
"prompt": "What are two ways to start experimenting with Claude as a user, according to Anthropic's documentation?",
|
|
"provider": "python:provider_retrieval.py:retrieve_base",
|
|
"latencyMs": 1179,
|
|
"gradingResult": {
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/welcome#get-started\",\"https://docs.claude.com/en/docs/about-claude/models#get-started-with-claude\",\"https://docs.claude.com/en/docs/welcome#develop-with-claude\"]",
|
|
"prompt": "What are two ways to start experimenting with Claude as a user, according to Anthropic's documentation?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_two",
|
|
"latencyMs": 1251,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.5",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "MRR is 0.5",
|
|
"named_scores": {
|
|
"MRR": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "MRR is 0.5",
|
|
"named_scores": {
|
|
"MRR": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/about-claude/models#get-started-with-claude\",\"https://docs.claude.com/en/docs/welcome#get-started\",\"https://docs.claude.com/en/docs/intro-to-claude#start-building-with-claude\"]",
|
|
"prompt": "What are two ways to start experimenting with Claude as a user, according to Anthropic's documentation?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_three",
|
|
"latencyMs": 9837,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
}
|
|
],
|
|
"test": {
|
|
"vars": {
|
|
"query": "What are two ways to start experimenting with Claude as a user, according to Anthropic's documentation?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/models#get-started-with-claude\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #48"
|
|
},
|
|
"vars": [
|
|
"[\"https://docs.claude.com/en/docs/about-claude/models#get-started-with-claude\"]",
|
|
"What are two ways to start experimenting with Claude as a user, according to Anthropic's documentation?"
|
|
]
|
|
},
|
|
{
|
|
"description": "Row #49",
|
|
"outputs": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#when-to-chain-prompts\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#chain-prompts-for-complex-tasks\"]",
|
|
"prompt": "How can using chain prompts help reduce errors and inconsistency in complex tasks handled by Claude?",
|
|
"provider": "python:provider_retrieval.py:retrieve_base",
|
|
"latencyMs": 1490,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#when-to-chain-prompts\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#chain-prompts-for-complex-tasks\"]",
|
|
"prompt": "How can using chain prompts help reduce errors and inconsistency in complex tasks handled by Claude?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_two",
|
|
"latencyMs": 1446,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#chain-prompts-for-complex-tasks\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#when-to-chain-prompts\"]",
|
|
"prompt": "How can using chain prompts help reduce errors and inconsistency in complex tasks handled by Claude?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_three",
|
|
"latencyMs": 3218,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
}
|
|
],
|
|
"test": {
|
|
"vars": {
|
|
"query": "How can using chain prompts help reduce errors and inconsistency in complex tasks handled by Claude?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#chain-prompts-for-complex-tasks\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #49"
|
|
},
|
|
"vars": [
|
|
"[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#chain-prompts-for-complex-tasks\"]",
|
|
"How can using chain prompts help reduce errors and inconsistency in complex tasks handled by Claude?"
|
|
]
|
|
},
|
|
{
|
|
"description": "Row #50",
|
|
"outputs": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/api/messages-streaming#error-events\",\"https://docs.claude.com/en/api/streaming#error-event-types\",\"https://docs.claude.com/en/api/errors#http-errors\"]",
|
|
"prompt": "What HTTP status code does an overloaded_error event correspond to in a non-streaming context for the Claude API?",
|
|
"provider": "python:provider_retrieval.py:retrieve_base",
|
|
"latencyMs": 1073,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/api/messages-streaming#error-events\",\"https://docs.claude.com/en/api/streaming#error-event-types\",\"https://docs.claude.com/en/api/errors#http-errors\"]",
|
|
"prompt": "What HTTP status code does an overloaded_error event correspond to in a non-streaming context for the Claude API?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_two",
|
|
"latencyMs": 1205,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/api/messages-streaming#error-events\",\"https://docs.claude.com/en/api/streaming#error-event-types\",\"https://docs.claude.com/en/api/errors#http-errors\"]",
|
|
"prompt": "What HTTP status code does an overloaded_error event correspond to in a non-streaming context for the Claude API?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_three",
|
|
"latencyMs": 0,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
}
|
|
],
|
|
"test": {
|
|
"vars": {
|
|
"query": "What HTTP status code does an overloaded_error event correspond to in a non-streaming context for the Claude API?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/streaming#error-event-types\",\"https://docs.claude.com/en/api/messages-streaming#error-events\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #50"
|
|
},
|
|
"vars": [
|
|
"[\"https://docs.claude.com/en/api/streaming#error-event-types\",\"https://docs.claude.com/en/api/messages-streaming#error-events\"]",
|
|
"What HTTP status code does an overloaded_error event correspond to in a non-streaming context for the Claude API?"
|
|
]
|
|
},
|
|
{
|
|
"description": "Row #51",
|
|
"outputs": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#getting-started-with-voyage-ai\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic\"]",
|
|
"prompt": "What are the two ways to specify the format in which Voyage AI returns embeddings through its HTTP API?",
|
|
"provider": "python:provider_retrieval.py:retrieve_base",
|
|
"latencyMs": 1307,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#getting-started-with-voyage-ai\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic\"]",
|
|
"prompt": "What are the two ways to specify the format in which Voyage AI returns embeddings through its HTTP API?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_two",
|
|
"latencyMs": 1260,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-python-package\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example\"]",
|
|
"prompt": "What are the two ways to specify the format in which Voyage AI returns embeddings through its HTTP API?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_three",
|
|
"latencyMs": 3016,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
}
|
|
],
|
|
"test": {
|
|
"vars": {
|
|
"query": "What are the two ways to specify the format in which Voyage AI returns embeddings through its HTTP API?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #51"
|
|
},
|
|
"vars": [
|
|
"[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api\"]",
|
|
"What are the two ways to specify the format in which Voyage AI returns embeddings through its HTTP API?"
|
|
]
|
|
},
|
|
{
|
|
"description": "Row #52",
|
|
"outputs": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]",
|
|
"prompt": "When streaming API requests that use tools, how are the input JSON deltas for tool_use content blocks sent, and how can they be accumulated and parsed by the client?",
|
|
"provider": "python:provider_retrieval.py:retrieve_base",
|
|
"latencyMs": 1308,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]",
|
|
"prompt": "When streaming API requests that use tools, how are the input JSON deltas for tool_use content blocks sent, and how can they be accumulated and parsed by the client?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_two",
|
|
"latencyMs": 1372,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]",
|
|
"prompt": "When streaming API requests that use tools, how are the input JSON deltas for tool_use content blocks sent, and how can they be accumulated and parsed by the client?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_three",
|
|
"latencyMs": 0,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
}
|
|
],
|
|
"test": {
|
|
"vars": {
|
|
"query": "When streaming API requests that use tools, how are the input JSON deltas for tool_use content blocks sent, and how can they be accumulated and parsed by the client?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#streaming-request-with-tool-use\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #52"
|
|
},
|
|
"vars": [
|
|
"[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#streaming-request-with-tool-use\"]",
|
|
"When streaming API requests that use tools, how are the input JSON deltas for tool_use content blocks sent, and how can they be accumulated and parsed by the client?"
|
|
]
|
|
},
|
|
{
|
|
"description": "Row #53",
|
|
"outputs": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow\"]",
|
|
"prompt": "What are the two interactive prompt engineering tutorials that Anthropic offers, and how do they differ?",
|
|
"provider": "python:provider_retrieval.py:retrieve_base",
|
|
"latencyMs": 1363,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow\"]",
|
|
"prompt": "What are the two interactive prompt engineering tutorials that Anthropic offers, and how do they differ?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_two",
|
|
"latencyMs": 1322,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prompt-generator#next-steps\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial\"]",
|
|
"prompt": "What are the two interactive prompt engineering tutorials that Anthropic offers, and how do they differ?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_three",
|
|
"latencyMs": 3510,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.6666666666666666,
|
|
"reason": "Precision is 0.6666666666666666",
|
|
"named_scores": {
|
|
"Precision": 0.6666666666666666
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.8,
|
|
"reason": "F1 is 0.8",
|
|
"named_scores": {
|
|
"F1": 0.8
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
}
|
|
],
|
|
"test": {
|
|
"vars": {
|
|
"query": "What are the two interactive prompt engineering tutorials that Anthropic offers, and how do they differ?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #53"
|
|
},
|
|
"vars": [
|
|
"[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial\"]",
|
|
"What are the two interactive prompt engineering tutorials that Anthropic offers, and how do they differ?"
|
|
]
|
|
},
|
|
{
|
|
"description": "Row #54",
|
|
"outputs": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/intro-to-claude#enterprise-considerations\",\"https://docs.claude.com/en/docs/intro-to-claude#what-you-can-do-with-claude\",\"https://docs.claude.com/en/docs/intro-to-claude#model-options\"]",
|
|
"prompt": "What are some of the key capabilities that make Claude suitable for enterprise use cases requiring integration with specialized applications and processing of large volumes of sensitive data?",
|
|
"provider": "python:provider_retrieval.py:retrieve_base",
|
|
"latencyMs": 1406,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/intro-to-claude#model-options\",\"https://docs.claude.com/en/docs/intro-to-claude#enterprise-considerations\",\"https://docs.claude.com/en/docs/intro-to-claude#implementing-claude\"]",
|
|
"prompt": "What are some of the key capabilities that make Claude suitable for enterprise use cases requiring integration with specialized applications and processing of large volumes of sensitive data?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_two",
|
|
"latencyMs": 1275,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.5",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "MRR is 0.5",
|
|
"named_scores": {
|
|
"MRR": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "MRR is 0.5",
|
|
"named_scores": {
|
|
"MRR": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/intro-to-claude#enterprise-considerations\",\"https://docs.claude.com/en/docs/intro-to-claude#what-you-can-do-with-claude\",\"https://docs.claude.com/en/docs/intro-to-claude#model-options\"]",
|
|
"prompt": "What are some of the key capabilities that make Claude suitable for enterprise use cases requiring integration with specialized applications and processing of large volumes of sensitive data?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_three",
|
|
"latencyMs": 1,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "Recall is 1.0",
|
|
"named_scores": {
|
|
"Recall": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
}
|
|
],
|
|
"test": {
|
|
"vars": {
|
|
"query": "What are some of the key capabilities that make Claude suitable for enterprise use cases requiring integration with specialized applications and processing of large volumes of sensitive data?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/intro-to-claude#enterprise-considerations\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #54"
|
|
},
|
|
"vars": [
|
|
"[\"https://docs.claude.com/en/docs/intro-to-claude#enterprise-considerations\"]",
|
|
"What are some of the key capabilities that make Claude suitable for enterprise use cases requiring integration with specialized applications and processing of large volumes of sensitive data?"
|
|
]
|
|
},
|
|
{
"description": "Row #55",
"outputs": [
{
"pass": true, "score": 0.6666666666666666, "namedScores": {},
"text": "[\"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024\",\"https://docs.claude.com/en/api/claude-on-vertex-ai#model-availability\"]",
"prompt": "As of June 2024, in which regions are Anthropic's Claude.ai API and iOS app available?",
"provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1268,
"gradingResult": { "pass": true, "score": 0.6666666666666666, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
"componentResults": [
{ "pass": true, "score": 0.6666666666666666, "reason": "Precision: 0.6666666666666666 \n Recall: 0.6666666666666666 \n F1 Score: 0.6666666666666666 \n MRR: 1.0", "componentResults": [
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
{ "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
{ "pass": true, "score": 0.6666666666666666, "reason": "Recall is 0.6666666666666666", "named_scores": { "Recall": 0.6666666666666666 } },
{ "pass": true, "score": 0.6666666666666666, "reason": "F1 is 0.6666666666666666", "named_scores": { "F1": 0.6666666666666666 } }
], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.6666666666666666, "reason": "Recall is 0.6666666666666666", "named_scores": { "Recall": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.6666666666666666, "reason": "F1 is 0.6666666666666666", "named_scores": { "F1": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
],
"assertion": null },
"cost": 0
},
{
"pass": true, "score": 0.6666666666666666, "namedScores": {},
"text": "[\"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024\",\"https://docs.claude.com/en/api/claude-on-vertex-ai#model-availability\"]",
"prompt": "As of June 2024, in which regions are Anthropic's Claude.ai API and iOS app available?",
"provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1303,
"gradingResult": { "pass": true, "score": 0.6666666666666666, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
"componentResults": [
{ "pass": true, "score": 0.6666666666666666, "reason": "Precision: 0.6666666666666666 \n Recall: 0.6666666666666666 \n F1 Score: 0.6666666666666666 \n MRR: 1.0", "componentResults": [
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
{ "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
{ "pass": true, "score": 0.6666666666666666, "reason": "Recall is 0.6666666666666666", "named_scores": { "Recall": 0.6666666666666666 } },
{ "pass": true, "score": 0.6666666666666666, "reason": "F1 is 0.6666666666666666", "named_scores": { "F1": 0.6666666666666666 } }
], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.6666666666666666, "reason": "Recall is 0.6666666666666666", "named_scores": { "Recall": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.6666666666666666, "reason": "F1 is 0.6666666666666666", "named_scores": { "F1": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
],
"assertion": null },
"cost": 0
},
{
"pass": true, "score": 1, "namedScores": {},
"text": "[\"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#may-1st-2024\"]",
"prompt": "As of June 2024, in which regions are Anthropic's Claude.ai API and iOS app available?",
"provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 8043,
"gradingResult": { "pass": true, "score": 1, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
"componentResults": [
{ "pass": true, "score": 1, "reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0", "componentResults": [
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
{ "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } },
{ "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
{ "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 } }
], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "F1 is 1.0", "named_scores": { "F1": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
],
"assertion": null },
"cost": 0
}
],
"test": { "vars": {
"query": "As of June 2024, in which regions are Anthropic's Claude.ai API and iOS app available?",
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/claude-apps#may-1st-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024\"]" },
"assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
"options": {}, "description": "Row #55" },
"vars": [
"[\"https://docs.claude.com/en/release-notes/claude-apps#may-1st-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024\"]",
"As of June 2024, in which regions are Anthropic's Claude.ai API and iOS app available?"
]
},
{
"description": "Row #56",
"outputs": [
{
"pass": true, "score": 0.8, "namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#advantages-of-using-claude\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#introduction\"]",
"prompt": "What are the two main approaches for integrating Claude into a support ticket workflow, and how do they differ in terms of scalability and ease of implementation?",
"provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1565,
"gradingResult": { "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
"componentResults": [
{ "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
{ "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
{ "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
{ "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
],
"assertion": null },
"cost": 0
},
{
"pass": true, "score": 0.8, "namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#introduction\"]",
"prompt": "What are the two main approaches for integrating Claude into a support ticket workflow, and how do they differ in terms of scalability and ease of implementation?",
"provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1540,
"gradingResult": { "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
"componentResults": [
{ "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
{ "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
{ "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
{ "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
],
"assertion": null },
"cost": 0
},
{
"pass": true, "score": 0.4, "namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#adapting-to-common-scenarios\"]",
"prompt": "What are the two main approaches for integrating Claude into a support ticket workflow, and how do they differ in terms of scalability and ease of implementation?",
"provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 8149,
"gradingResult": { "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
"componentResults": [
{ "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0", "componentResults": [
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
{ "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
{ "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
{ "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
],
"assertion": null },
"cost": 0
}
],
"test": { "vars": {
"query": "What are the two main approaches for integrating Claude into a support ticket workflow, and how do they differ in terms of scalability and ease of implementation?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#introduction\"]" },
"assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
"options": {}, "description": "Row #56" },
"vars": [
"[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#introduction\"]",
"What are the two main approaches for integrating Claude into a support ticket workflow, and how do they differ in terms of scalability and ease of implementation?"
]
},
{
"description": "Row #57",
"outputs": [
{
"pass": false, "score": 0, "namedScores": {},
"text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources\",\"https://docs.claude.com/en/docs/welcome#develop-with-claude\",\"https://docs.claude.com/en/docs/intro-to-claude#start-building-with-claude\"]",
"prompt": "When did Anthropic release a prompt generator tool to help guide Claude in generating high-quality prompts, and through what interface is it available?",
"provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1194,
"gradingResult": { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
"componentResults": [
{ "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "componentResults": [
{ "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
{ "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
{ "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
{ "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
],
"assertion": null },
"cost": 0
},
{
"pass": true, "score": 0.5, "namedScores": {},
"text": "[\"https://docs.claude.com/en/release-notes/api#may-10th-2024\",\"https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow\"]",
"prompt": "When did Anthropic release a prompt generator tool to help guide Claude in generating high-quality prompts, and through what interface is it available?",
"provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1253,
"gradingResult": { "pass": true, "score": 0.5, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
"componentResults": [
{ "pass": true, "score": 0.5, "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0", "componentResults": [
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
{ "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
{ "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
{ "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
],
"assertion": null },
"cost": 0
},
{
"pass": true, "score": 0.5, "namedScores": {},
"text": "[\"https://docs.claude.com/en/release-notes/api#may-10th-2024\",\"https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow\"]",
"prompt": "When did Anthropic release a prompt generator tool to help guide Claude in generating high-quality prompts, and through what interface is it available?",
"provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 1,
"gradingResult": { "pass": true, "score": 0.5, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
"componentResults": [
{ "pass": true, "score": 0.5, "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0", "componentResults": [
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
{ "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
{ "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
{ "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
],
"assertion": null },
"cost": 0
}
],
"test": { "vars": {
"query": "When did Anthropic release a prompt generator tool to help guide Claude in generating high-quality prompts, and through what interface is it available?",
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#may-10th-2024\"]" },
"assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
"options": {}, "description": "Row #57" },
"vars": [
"[\"https://docs.claude.com/en/release-notes/api#may-10th-2024\"]",
"When did Anthropic release a prompt generator tool to help guide Claude in generating high-quality prompts, and through what interface is it available?"
]
},
{
"description": "Row #58",
"outputs": [
{
"pass": false, "score": 0, "namedScores": {},
"text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification\",\"https://docs.claude.com/en/docs/welcome#models\",\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\"]",
"prompt": "Which Claude 3 model provides the best balance of intelligence and speed for high-throughput tasks like sales forecasting and targeted marketing?",
"provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1350,
"gradingResult": { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
"componentResults": [
{ "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "componentResults": [
{ "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
{ "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
{ "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
{ "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
],
"assertion": null },
"cost": 0
},
{
"pass": true, "score": 0.4, "namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/intro-to-claude#claude-3-family\",\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\",\"https://docs.claude.com/en/docs/welcome#models\"]",
"prompt": "Which Claude 3 model provides the best balance of intelligence and speed for high-throughput tasks like sales forecasting and targeted marketing?",
"provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1257,
"gradingResult": { "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
"componentResults": [
{ "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0", "componentResults": [
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
{ "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
{ "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
{ "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
],
"assertion": null },
"cost": 0
},
{
"pass": false, "score": 0, "namedScores": {},
"text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification\",\"https://docs.claude.com/en/docs/welcome#models\",\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\"]",
"prompt": "Which Claude 3 model provides the best balance of intelligence and speed for high-throughput tasks like sales forecasting and targeted marketing?",
"provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 0,
"gradingResult": { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
"componentResults": [
{ "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "componentResults": [
{ "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
{ "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
{ "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
{ "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
],
"assertion": null },
"cost": 0
}
],
"test": { "vars": {
"query": "Which Claude 3 model provides the best balance of intelligence and speed for high-throughput tasks like sales forecasting and targeted marketing?",
"correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-vertex-ai#api-model-names\",\"https://docs.claude.com/en/docs/intro-to-claude#claude-3-family\"]" },
"assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
"options": {}, "description": "Row #58" },
"vars": [
"[\"https://docs.claude.com/en/api/claude-on-vertex-ai#api-model-names\",\"https://docs.claude.com/en/docs/intro-to-claude#claude-3-family\"]",
"Which Claude 3 model provides the best balance of intelligence and speed for high-throughput tasks like sales forecasting and targeted marketing?"
]
},
{
"description": "Row #59",
"outputs": [
{
"pass": true, "score": 0.8, "namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#faq\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models\"]",
"prompt": "How can you calculate the similarity between two Voyage embedding vectors, and what is this equivalent to since Voyage embeddings are normalized to length 1?",
"provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1318,
"gradingResult": { "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
"componentResults": [
{ "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
{ "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
{ "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
{ "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
],
"assertion": null },
"cost": 0
},
{
"pass": true, "score": 0.8, "namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#faq\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models\"]",
"prompt": "How can you calculate the similarity between two Voyage embedding vectors, and what is this equivalent to since Voyage embeddings are normalized to length 1?",
"provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1290,
"gradingResult": { "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
"componentResults": [
{ "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
{ "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
{ "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
{ "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
],
"assertion": null },
"cost": 0
},
{
"pass": true, "score": 0.8, "namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#faq\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models\"]",
"prompt": "How can you calculate the similarity between two Voyage embedding vectors, and what is this equivalent to since Voyage embeddings are normalized to length 1?",
"provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 0,
"gradingResult": { "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
"componentResults": [
{ "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0", "componentResults": [
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
{ "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
{ "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
{ "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
],
"assertion": null },
"cost": 0
}
],
"test": { "vars": {
"query": "How can you calculate the similarity between two Voyage embedding vectors, and what is this equivalent to since Voyage embeddings are normalized to length 1?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#faq\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example\"]" },
"assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
"options": {}, "description": "Row #59" },
"vars": [
"[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#faq\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example\"]",
"How can you calculate the similarity between two Voyage embedding vectors, and what is this equivalent to since Voyage embeddings are normalized to length 1?"
]
},
{
"description": "Row #60",
"outputs": [
{
"pass": true, "score": 0.4, "namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#iterating-your-prompt-for-better-performance\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\"]",
"prompt": "How can using examples in prompts improve Claude's performance on complex tasks?",
"provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1433,
"gradingResult": { "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
"componentResults": [
{ "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0", "componentResults": [
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
{ "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
{ "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
{ "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
],
"assertion": null },
"cost": 0
},
{
"pass": true, "score": 0.4, "namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#iterating-your-prompt-for-better-performance\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\"]",
"prompt": "How can using examples in prompts improve Claude's performance on complex tasks?",
"provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1395,
"gradingResult": { "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
"componentResults": [
{ "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0", "componentResults": [
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
{ "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
{ "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
{ "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
],
"assertion": null },
"cost": 0
},
{
"pass": false, "score": 0, "namedScores": {},
"text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#iterating-your-prompt-for-better-performance\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#adapting-to-common-scenarios\"]",
"prompt": "How can using examples in prompts improve Claude's performance on complex tasks?",
"provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 3354,
"gradingResult": { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
"componentResults": [
{ "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "componentResults": [
{ "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
{ "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
{ "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
{ "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
],
"assertion": null },
"cost": 0
}
],
"test": { "vars": {
"query": "How can using examples in prompts improve Claude's performance on complex tasks?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#chain-prompts-for-complex-tasks\"]" },
"assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
"options": {}, "description": "Row #60" },
"vars": [
"[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#chain-prompts-for-complex-tasks\"]",
"How can using examples in prompts improve Claude's performance on complex tasks?"
]
},
{
"description": "Row #61",
"outputs": [
{
"pass": true, "score": 0.8571428571428571, "namedScores": {},
"text": "[\"https://docs.claude.com/en/api/messages-streaming#delta-types\",\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]",
"prompt": "What are the two types of content block deltas that can be emitted when streaming responses with tool use, and what does each delta type contain?",
"provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1231,
"gradingResult": { "pass": true, "score": 0.8571428571428571, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
"componentResults": [
{ "pass": true, "score": 0.8571428571428571, "reason": "Precision: 1.0 \n Recall: 0.75 \n F1 Score: 0.8571428571428571 \n MRR: 1.0", "componentResults": [
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
{ "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } },
{ "pass": true, "score": 0.75, "reason": "Recall is 0.75", "named_scores": { "Recall": 0.75 } },
{ "pass": true, "score": 0.8571428571428571, "reason": "F1 is 0.8571428571428571", "named_scores": { "F1": 0.8571428571428571 } }
], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.75, "reason": "Recall is 0.75", "named_scores": { "Recall": 0.75 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.8571428571428571, "reason": "F1 is 0.8571428571428571", "named_scores": { "F1": 0.8571428571428571 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
],
"assertion": null },
"cost": 0
},
{
"pass": true, "score": 0.8571428571428571, "namedScores": {},
"text": "[\"https://docs.claude.com/en/api/messages-streaming#delta-types\",\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]",
"prompt": "What are the two types of content block deltas that can be emitted when streaming responses with tool use, and what does each delta type contain?",
"provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1250,
"gradingResult": { "pass": true, "score": 0.8571428571428571, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
"componentResults": [
{ "pass": true, "score": 0.8571428571428571, "reason": "Precision: 1.0 \n Recall: 0.75 \n F1 Score: 0.8571428571428571 \n MRR: 1.0", "componentResults": [
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
{ "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } },
{ "pass": true, "score": 0.75, "reason": "Recall is 0.75", "named_scores": { "Recall": 0.75 } },
{ "pass": true, "score": 0.8571428571428571, "reason": "F1 is 0.8571428571428571", "named_scores": { "F1": 0.8571428571428571 } }
], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.75, "reason": "Recall is 0.75", "named_scores": { "Recall": 0.75 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.8571428571428571, "reason": "F1 is 0.8571428571428571", "named_scores": { "F1": 0.8571428571428571 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
],
"assertion": null },
"cost": 0
},
{
"pass": true, "score": 0.8571428571428571, "namedScores": {},
"text": "[\"https://docs.claude.com/en/api/messages-streaming#delta-types\",\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]",
"prompt": "What are the two types of content block deltas that can be emitted when streaming responses with tool use, and what does each delta type contain?",
"provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 1,
"gradingResult": { "pass": true, "score": 0.8571428571428571, "reason": "All assertions passed", "namedScores": {}, "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
"componentResults": [
{ "pass": true, "score": 0.8571428571428571, "reason": "Precision: 1.0 \n Recall: 0.75 \n F1 Score: 0.8571428571428571 \n MRR: 1.0", "componentResults": [
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
{ "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 } },
{ "pass": true, "score": 0.75, "reason": "Recall is 0.75", "named_scores": { "Recall": 0.75 } },
{ "pass": true, "score": 0.8571428571428571, "reason": "F1 is 0.8571428571428571", "named_scores": { "F1": 0.8571428571428571 } }
], "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 1, "reason": "Precision is 1.0", "named_scores": { "Precision": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.75, "reason": "Recall is 0.75", "named_scores": { "Recall": 0.75 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
{ "pass": true, "score": 0.8571428571428571, "reason": "F1 is 0.8571428571428571", "named_scores": { "F1": 0.8571428571428571 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
],
"assertion": null },
"cost": 0
}
],
"test": { "vars": {
"query": "What are the two types of content block deltas that can be emitted when streaming responses with tool use, and what does each delta type contain?",
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#streaming-request-with-tool-use\",\"https://docs.claude.com/en/api/messages-streaming#delta-types\"]" },
"assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
"options": {}, "description": "Row #61" },
"vars": [
"[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#streaming-request-with-tool-use\",\"https://docs.claude.com/en/api/messages-streaming#delta-types\"]",
"What are the two types of content block deltas that can be emitted when streaming responses with tool use, and what does each delta type contain?"
]
},
{ "description": "Row #62",
  "outputs": [
    { "pass": true, "score": 0.5, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/intro-to-claude#what-you-can-do-with-claude\",\"https://docs.claude.com/en/docs/build-with-claude/text-generation#text-capabilities-and-use-cases\",\"https://docs.claude.com/en/docs/intro-to-claude#enterprise-considerations\"]",
      "prompt": "What are two key capabilities of Claude that enable it to build interactive systems and personalized user experiences?",
      "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1302,
      "gradingResult": { "pass": true, "score": 0.5, "reason": "All assertions passed", "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.5, "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.5",
            "componentResults": [
              { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null },
      "cost": 0 },
    { "pass": true, "score": 0.5, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/intro-to-claude#what-you-can-do-with-claude\",\"https://docs.claude.com/en/docs/build-with-claude/text-generation#text-capabilities-and-use-cases\",\"https://docs.claude.com/en/docs/intro-to-claude#implementing-claude\"]",
      "prompt": "What are two key capabilities of Claude that enable it to build interactive systems and personalized user experiences?",
      "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1387,
      "gradingResult": { "pass": true, "score": 0.5, "reason": "All assertions passed", "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.5, "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.5",
            "componentResults": [
              { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null },
      "cost": 0 },
    { "pass": true, "score": 0.5, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/text-generation#text-capabilities-and-use-cases\",\"https://docs.claude.com/en/docs/welcome#key-capabilities\",\"https://docs.claude.com/en/docs/intro-to-claude#what-you-can-do-with-claude\"]",
      "prompt": "What are two key capabilities of Claude that enable it to build interactive systems and personalized user experiences?",
      "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 3452,
      "gradingResult": { "pass": true, "score": 0.5, "reason": "All assertions passed", "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.5, "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
            "componentResults": [
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null },
      "cost": 0 }
  ],
  "test": { "vars": { "query": "What are two key capabilities of Claude that enable it to build interactive systems and personalized user experiences?",
      "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/text-generation#text-capabilities-and-use-cases\"]" },
    "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
    "options": {}, "description": "Row #62" },
  "vars": [ "[\"https://docs.claude.com/en/docs/build-with-claude/text-generation#text-capabilities-and-use-cases\"]",
    "What are two key capabilities of Claude that enable it to build interactive systems and personalized user experiences?" ]
},
{ "description": "Row #63",
  "outputs": [
    { "pass": true, "score": 0.8, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/api/messages-streaming#raw-http-stream-response\",\"https://docs.claude.com/en/api/messages-streaming#event-types\",\"https://docs.claude.com/en/api/messages-streaming#basic-streaming-request\"]",
      "prompt": "What are the key event types included in a raw HTTP stream response when using message streaming, and what is the typical order they occur in?",
      "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1475,
      "gradingResult": { "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
            "componentResults": [
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
              { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null },
      "cost": 0 },
    { "pass": true, "score": 0.8, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/api/messages-streaming#raw-http-stream-response\",\"https://docs.claude.com/en/api/messages-streaming#event-types\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format\"]",
      "prompt": "What are the key event types included in a raw HTTP stream response when using message streaming, and what is the typical order they occur in?",
      "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1282,
      "gradingResult": { "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
            "componentResults": [
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
              { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null },
      "cost": 0 },
    { "pass": true, "score": 0.8, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/api/messages-streaming#raw-http-stream-response\",\"https://docs.claude.com/en/api/messages-streaming#event-types\",\"https://docs.claude.com/en/api/messages-streaming#basic-streaming-request\"]",
      "prompt": "What are the key event types included in a raw HTTP stream response when using message streaming, and what is the typical order they occur in?",
      "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 3489,
      "gradingResult": { "pass": true, "score": 0.8, "reason": "All assertions passed", "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.8, "reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
            "componentResults": [
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
              { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.6666666666666666, "reason": "Precision is 0.6666666666666666", "named_scores": { "Precision": 0.6666666666666666 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.8, "reason": "F1 is 0.8", "named_scores": { "F1": 0.8 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null },
      "cost": 0 }
  ],
  "test": { "vars": { "query": "What are the key event types included in a raw HTTP stream response when using message streaming, and what is the typical order they occur in?",
      "correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#event-types\",\"https://docs.claude.com/en/api/messages-streaming#raw-http-stream-response\"]" },
    "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
    "options": {}, "description": "Row #63" },
  "vars": [ "[\"https://docs.claude.com/en/api/messages-streaming#event-types\",\"https://docs.claude.com/en/api/messages-streaming#raw-http-stream-response\"]",
    "What are the key event types included in a raw HTTP stream response when using message streaming, and what is the typical order they occur in?" ]
},
{ "description": "Row #64",
  "outputs": [
    { "pass": true, "score": 0.4, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size\",\"https://docs.claude.com/en/api/messages-examples#vision\",\"https://docs.claude.com/en/docs/build-with-claude/vision#faq\"]",
      "prompt": "What is the maximum number of images that can be included in a single request using the Claude API compared to the claude.ai interface?",
      "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1502,
      "gradingResult": { "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
            "componentResults": [
              { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
              { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null },
      "cost": 0 },
    { "pass": false, "score": 0, "namedScores": {},
      "text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/api/rate-limits#about-our-limits\",\"https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size\",\"https://docs.claude.com/en/api/messages-examples#vision\"]",
      "prompt": "What is the maximum number of images that can be included in a single request using the Claude API compared to the claude.ai interface?",
      "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1489,
      "gradingResult": { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
            "componentResults": [
              { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
              { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
              { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
              { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null },
      "cost": 0 },
    { "pass": true, "score": 0.4, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size\",\"https://docs.claude.com/en/docs/build-with-claude/vision#faq\",\"https://docs.claude.com/en/api/messages-examples#vision\"]",
      "prompt": "What is the maximum number of images that can be included in a single request using the Claude API compared to the claude.ai interface?",
      "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 3939,
      "gradingResult": { "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
            "componentResults": [
              { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
              { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null },
      "cost": 0 }
  ],
  "test": { "vars": { "query": "What is the maximum number of images that can be included in a single request using the Claude API compared to the claude.ai interface?",
      "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#about-the-prompt-examples\",\"https://docs.claude.com/en/docs/build-with-claude/vision#faq\"]" },
    "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
    "options": {}, "description": "Row #64" },
  "vars": [ "[\"https://docs.claude.com/en/docs/build-with-claude/vision#about-the-prompt-examples\",\"https://docs.claude.com/en/docs/build-with-claude/vision#faq\"]",
    "What is the maximum number of images that can be included in a single request using the Claude API compared to the claude.ai interface?" ]
},
{ "description": "Row #65",
  "outputs": [
    { "pass": false, "score": 0, "namedScores": {},
      "text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#pricing\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#next-steps\"]",
      "prompt": "When Claude's response is cut off due to hitting the max_tokens limit and contains an incomplete tool use block, what should you do to get the full tool use?",
      "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1541,
      "gradingResult": { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
            "componentResults": [
              { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
              { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
              { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
              { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null },
      "cost": 0 },
    { "pass": true, "score": 0.5, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#pricing\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors\"]",
      "prompt": "When Claude's response is cut off due to hitting the max_tokens limit and contains an incomplete tool use block, what should you do to get the full tool use?",
      "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1512,
      "gradingResult": { "pass": true, "score": 0.5, "reason": "All assertions passed", "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.5, "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.3333333333333333",
            "componentResults": [
              { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null },
      "cost": 0 },
    { "pass": true, "score": 0.5, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#forcing-tool-use\"]",
      "prompt": "When Claude's response is cut off due to hitting the max_tokens limit and contains an incomplete tool use block, what should you do to get the full tool use?",
      "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 7980,
      "gradingResult": { "pass": true, "score": 0.5, "reason": "All assertions passed", "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.5, "reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
            "componentResults": [
              { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 } },
              { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "MRR is 1.0", "named_scores": { "MRR": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 1, "reason": "Recall is 1.0", "named_scores": { "Recall": 1 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "F1 is 0.5", "named_scores": { "F1": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null },
      "cost": 0 }
  ],
  "test": { "vars": { "query": "When Claude's response is cut off due to hitting the max_tokens limit and contains an incomplete tool use block, what should you do to get the full tool use?",
      "correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors\"]" },
    "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
    "options": {}, "description": "Row #65" },
  "vars": [ "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors\"]",
    "When Claude's response is cut off due to hitting the max_tokens limit and contains an incomplete tool use block, what should you do to get the full tool use?" ]
},
{ "description": "Row #66",
  "outputs": [
    { "pass": false, "score": 0, "namedScores": {},
      "text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#deploy-your-classifier\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics\"]",
      "prompt": "What two steps are needed before running a classification evaluation on Claude according to the documentation?",
      "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1259,
      "gradingResult": { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
            "componentResults": [
              { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
              { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
              { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
              { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null },
      "cost": 0 },
    { "pass": false, "score": 0, "namedScores": {},
      "text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#deploy-your-classifier\"]",
      "prompt": "What two steps are needed before running a classification evaluation on Claude according to the documentation?",
      "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1278,
      "gradingResult": { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
            "componentResults": [
              { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
              { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
              { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
              { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null },
      "cost": 0 },
    { "pass": false, "score": 0, "namedScores": {},
      "text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#deploy-your-classifier\"]",
      "prompt": "What two steps are needed before running a classification evaluation on Claude according to the documentation?",
      "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 1,
      "gradingResult": { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
            "componentResults": [
              { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
              { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
              { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
              { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null },
      "cost": 0 }
  ],
  "test": { "vars": { "query": "What two steps are needed before running a classification evaluation on Claude according to the documentation?",
      "correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#3-run-your-eval\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#2-develop-your-test-cases\"]" },
    "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
    "options": {}, "description": "Row #66" },
  "vars": [ "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#3-run-your-eval\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#2-develop-your-test-cases\"]",
    "What two steps are needed before running a classification evaluation on Claude according to the documentation?" ]
},
{ "description": "Row #67",
  "outputs": [
    { "pass": false, "score": 0, "namedScores": {},
      "text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#controlling-claudes-output\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#advanced-use\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth\"]",
      "prompt": "How can you use the content parameter in the messages list to influence Claude's response?",
      "provider": "python:provider_retrieval.py:retrieve_base", "latencyMs": 1462,
      "gradingResult": { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0", "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": false, "score": 0, "reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
            "componentResults": [
              { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 } },
              { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 } },
              { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 } },
              { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "MRR is 0", "named_scores": { "MRR": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "Precision is 0.0", "named_scores": { "Precision": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "Recall is 0.0", "named_scores": { "Recall": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0, "reason": "F1 is 0", "named_scores": { "F1": 0 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null },
      "cost": 0 },
    { "pass": true, "score": 0.4, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#advanced-use\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth\",\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\"]",
      "prompt": "How can you use the content parameter in the messages list to influence Claude's response?",
      "provider": "python:provider_retrieval.py:retrieve_level_two", "latencyMs": 1437,
      "gradingResult": { "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
            "componentResults": [
              { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
              { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "MRR is 0.3333333333333333", "named_scores": { "MRR": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null },
      "cost": 0 },
    { "pass": true, "score": 0.4, "namedScores": {},
      "text": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth\",\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#how-to-prefill-claudes-response\"]",
      "prompt": "How can you use the content parameter in the messages list to influence Claude's response?",
      "provider": "python:provider_retrieval.py:retrieve_level_three", "latencyMs": 3727,
      "gradingResult": { "pass": true, "score": 0.4, "reason": "All assertions passed", "namedScores": {},
        "tokensUsed": { "total": 0, "prompt": 0, "completion": 0 },
        "componentResults": [
          { "pass": true, "score": 0.4, "reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
            "componentResults": [
              { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 } },
              { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 } },
              { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 } },
              { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 } }
            ],
            "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "MRR is 0.5", "named_scores": { "MRR": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.3333333333333333, "reason": "Precision is 0.3333333333333333", "named_scores": { "Precision": 0.3333333333333333 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.5, "reason": "Recall is 0.5", "named_scores": { "Recall": 0.5 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } },
          { "pass": true, "score": 0.4, "reason": "F1 is 0.4", "named_scores": { "F1": 0.4 }, "assertion": { "type": "python", "value": "file://eval_retrieval.py" } }
        ],
        "assertion": null },
      "cost": 0 }
  ],
  "test": { "vars": { "query": "How can you use the content parameter in the messages list to influence Claude's response?",
      "correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#basic-request-and-response\",\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\"]" },
    "assert": [ { "type": "python", "value": "file://eval_retrieval.py" } ],
    "options": {}, "description": "Row #67" },
  "vars": [ "[\"https://docs.claude.com/en/api/messages-examples#basic-request-and-response\",\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\"]",
    "How can you use the content parameter in the messages list to influence Claude's response?" ]
},
{
|
|
"description": "Row #68",
|
|
"outputs": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak\"]",
|
|
"prompt": "What are two key advantages of prompt engineering over fine-tuning when it comes to model comprehension and general knowledge preservation?",
|
|
"provider": "python:provider_retrieval.py:retrieve_base",
|
|
"latencyMs": 1222,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision: 0.5 \n Recall: 0.5 \n F1 Score: 0.5 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision is 0.5",
|
|
"named_scores": {
|
|
"Precision": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Precision is 0.5",
|
|
"named_scores": {
|
|
"Precision": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "F1 is 0.5",
|
|
"named_scores": {
|
|
"F1": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"namedScores": {},
|
|
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts\"]",
|
|
"prompt": "What are two key advantages of prompt engineering over fine-tuning when it comes to model comprehension and general knowledge preservation?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_two",
|
|
"latencyMs": 1229,
|
|
"gradingResult": {
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "All assertions passed",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 1,
|
|
"reason": "MRR is 1.0",
|
|
"named_scores": {
|
|
"MRR": 1
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.3333333333333333,
|
|
"reason": "Precision is 0.3333333333333333",
|
|
"named_scores": {
|
|
"Precision": 0.3333333333333333
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.5,
|
|
"reason": "Recall is 0.5",
|
|
"named_scores": {
|
|
"Recall": 0.5
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0.4,
|
|
"reason": "F1 is 0.4",
|
|
"named_scores": {
|
|
"F1": 0.4
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts\"]",
"prompt": "What are two key advantages of prompt engineering over fine-tuning when it comes to model comprehension and general knowledge preservation?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 0,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "What are two key advantages of prompt engineering over fine-tuning when it comes to model comprehension and general knowledge preservation?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #68"
},
"vars": [
"[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\"]",
"What are two key advantages of prompt engineering over fine-tuning when it comes to model comprehension and general knowledge preservation?"
]
},
{
"description": "Row #69",
"outputs": [
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#api-model-names\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests\",\"https://docs.claude.com/en/api/claude-on-vertex-ai#model-availability\"]",
"prompt": "What are the two main steps to get started with making requests to Claude models on Anthropic's Bedrock API?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1395,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": false,
"score": 0,
"namedScores": {},
"text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#accessing-bedrock\",\"https://docs.claude.com/en/docs/welcome#get-started\",\"https://docs.claude.com/en/docs/quickstart#prerequisites\"]",
"prompt": "What are the two main steps to get started with making requests to Claude models on Anthropic's Bedrock API?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1426,
"gradingResult": {
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"componentResults": [
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#subscribe-to-anthropic-models\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests\",\"https://docs.claude.com/en/docs/quickstart#prerequisites\"]",
"prompt": "What are the two main steps to get started with making requests to Claude models on Anthropic's Bedrock API?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 4005,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "What are the two main steps to get started with making requests to Claude models on Anthropic's Bedrock API?",
"correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#install-and-configure-the-aws-cli\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #69"
},
"vars": [
"[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#install-and-configure-the-aws-cli\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests\"]",
"What are the two main steps to get started with making requests to Claude models on Anthropic's Bedrock API?"
]
},
{
"description": "Row #70",
"outputs": [
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#list-available-models\",\"https://docs.claude.com/en/api/claude-on-vertex-ai#model-availability\",\"https://docs.claude.com/en/docs/intro-to-claude#model-options\"]",
"prompt": "How can you check which Claude models are available in a specific AWS region using the AWS CLI?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1334,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#list-available-models\",\"https://docs.claude.com/en/api/claude-on-vertex-ai#model-availability\",\"https://docs.claude.com/en/docs/about-claude/models#model-names\"]",
"prompt": "How can you check which Claude models are available in a specific AWS region using the AWS CLI?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1312,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#list-available-models\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#api-model-names\",\"https://docs.claude.com/en/api/claude-on-vertex-ai#model-availability\"]",
"prompt": "How can you check which Claude models are available in a specific AWS region using the AWS CLI?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 8622,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "How can you check which Claude models are available in a specific AWS region using the AWS CLI?",
"correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#subscribe-to-anthropic-models\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#list-available-models\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #70"
},
"vars": [
"[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#subscribe-to-anthropic-models\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#list-available-models\"]",
"How can you check which Claude models are available in a specific AWS region using the AWS CLI?"
]
},
{
"description": "Row #71",
"outputs": [
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-python-package\"]",
"prompt": "What argument can be passed to the voyageai.Client.embed() method or the Voyage HTTP API to specify whether the input text is a query or a document?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1482,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-python-package\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example\"]",
"prompt": "What argument can be passed to the voyageai.Client.embed() method or the Voyage HTTP API to specify whether the input text is a query or a document?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1355,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-python-package\"]",
"prompt": "What argument can be passed to the voyageai.Client.embed() method or the Voyage HTTP API to specify whether the input text is a query or a document?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 1,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "What argument can be passed to the voyageai.Client.embed() method or the Voyage HTTP API to specify whether the input text is a query or a document?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-python-package\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #71"
},
"vars": [
"[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-python-package\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api\"]",
"What argument can be passed to the voyageai.Client.embed() method or the Voyage HTTP API to specify whether the input text is a query or a document?"
]
},
{
"description": "Row #72",
"outputs": [
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#delta-types\"]",
"prompt": "How do the streaming API delta formats differ between tool_use content blocks and text content blocks?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1537,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#delta-types\"]",
"prompt": "How do the streaming API delta formats differ between tool_use content blocks and text content blocks?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1577,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#delta-types\"]",
"prompt": "How do the streaming API delta formats differ between tool_use content blocks and text content blocks?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 3352,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "How do the streaming API delta formats differ between tool_use content blocks and text content blocks?",
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #72"
},
"vars": [
"[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]",
"How do the streaming API delta formats differ between tool_use content blocks and text content blocks?"
]
},
{
"description": "Row #73",
"outputs": [
{
"pass": true,
"score": 0.5,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#faq\",\"https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size\",\"https://docs.claude.com/en/docs/build-with-claude/vision#ensuring-image-quality\"]",
"prompt": "What are the image file size limits when uploading images to Claude using the API versus on claude.ai?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1222,
"gradingResult": {
"pass": true,
"score": 0.5,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.5,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#faq\",\"https://docs.claude.com/en/api/rate-limits#about-our-limits\",\"https://docs.claude.com/en/docs/build-with-claude/vision#ensuring-image-quality\"]",
"prompt": "What are the image file size limits when uploading images to Claude using the API versus on claude.ai?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1250,
"gradingResult": {
"pass": true,
"score": 0.5,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.5,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#faq\",\"https://docs.claude.com/en/api/rate-limits#about-our-limits\",\"https://docs.claude.com/en/docs/build-with-claude/vision#ensuring-image-quality\"]",
"prompt": "What are the image file size limits when uploading images to Claude using the API versus on claude.ai?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 0,
"gradingResult": {
"pass": true,
"score": 0.5,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "What are the image file size limits when uploading images to Claude using the API versus on claude.ai?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#faq\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #73"
},
"vars": [
"[\"https://docs.claude.com/en/docs/build-with-claude/vision#faq\"]",
"What are the image file size limits when uploading images to Claude using the API versus on claude.ai?"
]
},
{
"description": "Row #74",
"outputs": [
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification\",\"https://docs.claude.com/en/docs/intro-to-claude#model-options\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\"]",
"prompt": "What is one key consideration when selecting a Claude model for an enterprise use case that needs low latency?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1368,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 0.5",
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/intro-to-claude#model-options\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification\"]",
"prompt": "What is one key consideration when selecting a Claude model for an enterprise use case that needs low latency?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1295,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#choosing-the-right-model\",\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\"]",
"prompt": "What is one key consideration when selecting a Claude model for an enterprise use case that needs low latency?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 3757,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "What is one key consideration when selecting a Claude model for an enterprise use case that needs low latency?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/intro-to-claude#model-options\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #74"
},
"vars": [
"[\"https://docs.claude.com/en/docs/intro-to-claude#model-options\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\"]",
"What is one key consideration when selecting a Claude model for an enterprise use case that needs low latency?"
]
},
{
"description": "Row #75",
"outputs": [
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example\"]",
"prompt": "What embedding model does Anthropic recommend for code retrieval, and how does its performance compare to alternatives according to Voyage AI?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1295,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example\"]",
"prompt": "What embedding model does Anthropic recommend for code retrieval, and how does its performance compare to alternatives according to Voyage AI?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1291,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
|
|
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example\"]",
"prompt": "What embedding model does Anthropic recommend for code retrieval, and how does its performance compare to alternatives according to Voyage AI?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 1,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "What embedding model does Anthropic recommend for code retrieval, and how does its performance compare to alternatives according to Voyage AI?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #75"
},
"vars": [
"[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models\"]",
"What embedding model does Anthropic recommend for code retrieval, and how does its performance compare to alternatives according to Voyage AI?"
]
},
|
|
{
"description": "Row #76",
"outputs": [
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/welcome#develop-with-claude\",\"https://docs.claude.com/en/docs/quickstart#next-steps\",\"https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook\"]",
"prompt": "What are two ways the Claude Cookbook can help developers learn to use Anthropic's APIs?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1373,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
|
|
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/quickstart#next-steps\",\"https://docs.claude.com/en/api/#accessing-the-api\",\"https://docs.claude.com/en/docs/welcome#develop-with-claude\"]",
"prompt": "What are two ways the Claude Cookbook can help developers learn to use Anthropic's APIs?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1494,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
|
|
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook\",\"https://docs.claude.com/en/docs/quickstart#next-steps\",\"https://docs.claude.com/en/docs/welcome#develop-with-claude\"]",
"prompt": "What are two ways the Claude Cookbook can help developers learn to use Anthropic's APIs?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 4931,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 0.5",
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "What are two ways the Claude Cookbook can help developers learn to use Anthropic's APIs?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/welcome#develop-with-claude\",\"https://docs.claude.com/en/docs/quickstart#next-steps\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #76"
},
"vars": [
"[\"https://docs.claude.com/en/docs/welcome#develop-with-claude\",\"https://docs.claude.com/en/docs/quickstart#next-steps\"]",
"What are two ways the Claude Cookbook can help developers learn to use Anthropic's APIs?"
]
},
|
|
{
"description": "Row #77",
"outputs": [
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/resources/glossary#rag-retrieval-augmented-generation\",\"https://docs.claude.com/en/docs/resources/glossary#context-window\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#use-retrieval-for-contextual-consistency\"]",
"prompt": "How does the size of the context window impact a language model's ability to utilize retrieval augmented generation (RAG)?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1607,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
|
|
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/resources/glossary#rag-retrieval-augmented-generation\",\"https://docs.claude.com/en/docs/resources/glossary#context-window\",\"https://docs.claude.com/en/docs/resources/glossary#tokens\"]",
"prompt": "How does the size of the context window impact a language model's ability to utilize retrieval augmented generation (RAG)?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1554,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
|
|
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/resources/glossary#rag-retrieval-augmented-generation\",\"https://docs.claude.com/en/docs/resources/glossary#context-window\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#2-optimize-prompt-and-output-length\"]",
"prompt": "How does the size of the context window impact a language model's ability to utilize retrieval augmented generation (RAG)?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 9206,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "How does the size of the context window impact a language model's ability to utilize retrieval augmented generation (RAG)?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#context-window\",\"https://docs.claude.com/en/docs/resources/glossary#rag-retrieval-augmented-generation\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #77"
},
"vars": [
"[\"https://docs.claude.com/en/docs/resources/glossary#context-window\",\"https://docs.claude.com/en/docs/resources/glossary#rag-retrieval-augmented-generation\"]",
"How does the size of the context window impact a language model's ability to utilize retrieval augmented generation (RAG)?"
]
},
|
|
{
"description": "Row #78",
"outputs": [
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\",\"https://docs.claude.com/en/docs/intro-to-claude#implementing-claude\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#why-use-claude-for-sheets\"]",
"prompt": "How can the Evaluation tool in Anthropic's Claude platform help improve prompts and build more robust AI applications?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1316,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
|
|
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial\",\"https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources\"]",
"prompt": "How can the Evaluation tool in Anthropic's Claude platform help improve prompts and build more robust AI applications?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1334,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
|
|
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial\",\"https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources\"]",
"prompt": "How can the Evaluation tool in Anthropic's Claude platform help improve prompts and build more robust AI applications?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 1,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "How can the Evaluation tool in Anthropic's Claude platform help improve prompts and build more robust AI applications?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\",\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#creating-test-cases\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #78"
},
"vars": [
"[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\",\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#creating-test-cases\"]",
"How can the Evaluation tool in Anthropic's Claude platform help improve prompts and build more robust AI applications?"
]
},
|
|
{
"description": "Row #79",
"outputs": [
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification\",\"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison\"]",
"prompt": "Which Claude model has the fastest comparative latency according to the comparison tables?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1229,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
|
|
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification\",\"https://docs.claude.com/en/docs/welcome#models\"]",
"prompt": "Which Claude model has the fastest comparative latency according to the comparison tables?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1253,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
|
|
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification\",\"https://docs.claude.com/en/docs/welcome#models\"]",
"prompt": "Which Claude model has the fastest comparative latency according to the comparison tables?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 0,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "Which Claude model has the fastest comparative latency according to the comparison tables?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\",\"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #79"
},
"vars": [
"[\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\",\"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison\"]",
"Which Claude model has the fastest comparative latency according to the comparison tables?"
]
},
|
|
{
"description": "Row #80",
"outputs": [
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns\",\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\",\"https://docs.claude.com/en/api/client-sdks#python\"]",
"prompt": "How can you build up a conversation with multiple turns using the Anthropic Messages API in Python?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1414,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
|
|
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns\",\"https://docs.claude.com/en/api/client-sdks#python\",\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\"]",
"prompt": "How can you build up a conversation with multiple turns using the Anthropic Messages API in Python?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1410,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
|
|
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns\",\"https://docs.claude.com/en/api/client-sdks#python\",\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\"]",
"prompt": "How can you build up a conversation with multiple turns using the Anthropic Messages API in Python?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 5221,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "How can you build up a conversation with multiple turns using the Anthropic Messages API in Python?",
"correct_chunks": "[\"https://docs.claude.com/en/api/client-sdks#python\",\"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #80"
},
"vars": [
"[\"https://docs.claude.com/en/api/client-sdks#python\",\"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns\"]",
"How can you build up a conversation with multiple turns using the Anthropic Messages API in Python?"
]
},
|
|
{
"description": "Row #81",
"outputs": [
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#why-use-role-prompting\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#why-use-xml-tags\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-1-legal-contract-analysis\"]",
"prompt": "How can using XML tags to provide a specific role or context help improve Claude's analysis of a legal contract compared to not using a role prompt?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1307,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
"componentResults": [
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
|
|
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#why-use-role-prompting\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#why-use-xml-tags\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-1-legal-contract-analysis\"]",
"prompt": "How can using XML tags to provide a specific role or context help improve Claude's analysis of a legal contract compared to not using a role prompt?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1296,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
"componentResults": [
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
|
|
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#why-use-role-prompting\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#why-use-xml-tags\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-1-legal-contract-analysis\"]",
"prompt": "How can using XML tags to provide a specific role or context help improve Claude's analysis of a legal contract compared to not using a role prompt?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 1,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
"componentResults": [
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "How can using XML tags to provide a specific role or context help improve Claude's analysis of a legal contract compared to not using a role prompt?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#examples\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-1-legal-contract-analysis\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #81"
},
"vars": [
"[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#examples\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-1-legal-contract-analysis\"]",
"How can using XML tags to provide a specific role or context help improve Claude's analysis of a legal contract compared to not using a role prompt?"
]
},
|
|
{
"description": "Row #82",
"outputs": [
{
"pass": false,
"score": 0,
"namedScores": {},
"text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#choosing-a-model\",\"https://docs.claude.com/en/docs/intro-to-claude#claude-3-family\",\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\"]",
"prompt": "What are the key differences between how Claude 3 Opus and Claude 3 Sonnet handle missing information when making tool calls?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1328,
"gradingResult": {
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"componentResults": [
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
|
|
{
|
|
"pass": false,
|
|
"score": 0,
|
|
"namedScores": {},
|
|
"text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#choosing-a-model\",\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\",\"https://docs.claude.com/en/docs/intro-to-claude#claude-3-family\"]",
|
|
"prompt": "What are the key differences between how Claude 3 Opus and Claude 3 Sonnet handle missing information when making tool calls?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_two",
|
|
"latencyMs": 1248,
|
|
"gradingResult": {
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
},
|
|
{
|
|
"pass": false,
|
|
"score": 0,
|
|
"namedScores": {},
|
|
"text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#choosing-a-model\",\"https://docs.claude.com/en/docs/intro-to-claude#claude-3-family\",\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\"]",
|
|
"prompt": "What are the key differences between how Claude 3 Opus and Claude 3 Sonnet handle missing information when making tool calls?",
|
|
"provider": "python:provider_retrieval.py:retrieve_level_three",
|
|
"latencyMs": 0,
|
|
"gradingResult": {
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"namedScores": {},
|
|
"tokensUsed": {
|
|
"total": 0,
|
|
"prompt": 0,
|
|
"completion": 0
|
|
},
|
|
"componentResults": [
|
|
{
|
|
"pass": false,
|
|
"score": 0,
|
|
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
|
|
"componentResults": [
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
}
|
|
}
|
|
],
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "MRR is 0",
|
|
"named_scores": {
|
|
"MRR": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Precision is 0.0",
|
|
"named_scores": {
|
|
"Precision": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "Recall is 0.0",
|
|
"named_scores": {
|
|
"Recall": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
},
|
|
{
|
|
"pass": true,
|
|
"score": 0,
|
|
"reason": "F1 is 0",
|
|
"named_scores": {
|
|
"F1": 0
|
|
},
|
|
"assertion": {
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
}
|
|
],
|
|
"assertion": null
|
|
},
|
|
"cost": 0
|
|
}
|
|
],
|
|
"test": {
|
|
"vars": {
|
|
"query": "What are the key differences between how Claude 3 Opus and Claude 3 Sonnet handle missing information when making tool calls?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#chain-of-thought\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#tool-use-examples\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #82"
|
|
},
|
|
"vars": [
|
|
"[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#chain-of-thought\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#tool-use-examples\"]",
|
|
"What are the key differences between how Claude 3 Opus and Claude 3 Sonnet handle missing information when making tool calls?"
|
|
]
|
|
},
|
|
{
"description": "Row #83",
"outputs": [
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#introduction\"]",
"prompt": "What steps should be taken to ensure a reliable deployment of an automated ticket routing system using Claude into a production environment?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1513,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#introduction\"]",
"prompt": "What steps should be taken to ensure a reliable deployment of an automated ticket routing system using Claude into a production environment?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1449,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#defining-the-task\"]",
"prompt": "What steps should be taken to ensure a reliable deployment of an automated ticket routing system using Claude into a production environment?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 3533,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "What steps should be taken to ensure a reliable deployment of an automated ticket routing system using Claude into a production environment?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #83"
},
"vars": [
"[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\"]",
"What steps should be taken to ensure a reliable deployment of an automated ticket routing system using Claude into a production environment?"
]
},
{
"description": "Row #84",
"outputs": [
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology\"]",
"prompt": "How should you evaluate a model's performance on a ticket routing classifier?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 6594,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations\"]",
"prompt": "How should you evaluate a model's performance on a ticket routing classifier?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1315,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#implement-claude-for-classification\"]",
"prompt": "How should you evaluate a model's performance on a ticket routing classifier?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 3437,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "How should you evaluate a model's performance on a ticket routing classifier?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #84"
},
"vars": [
"[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\"]",
"How should you evaluate a model's performance on a ticket routing classifier?"
]
},
{
"description": "Row #85",
"outputs": [
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial\"]",
"prompt": "What two methods does Anthropic recommend for learning how to prompt engineer with Claude before diving into the techniques?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1085,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial\"]",
"prompt": "What two methods does Anthropic recommend for learning how to prompt engineer with Claude before diving into the techniques?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 0,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-workflow\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial\"]",
"prompt": "What two methods does Anthropic recommend for learning how to prompt engineer with Claude before diving into the techniques?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 0,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "What two methods does Anthropic recommend for learning how to prompt engineer with Claude before diving into the techniques?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #85"
},
"vars": [
"[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial\"]",
"What two methods does Anthropic recommend for learning how to prompt engineer with Claude before diving into the techniques?"
]
},
{
"description": "Row #86",
"outputs": [
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/welcome#models\",\"https://docs.claude.com/en/docs/resources/glossary#pretraining\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#advantages-of-using-claude\"]",
"prompt": "What are the key differences between a pretrained large language model and Claude in terms of their training and capabilities?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1370,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/welcome#models\",\"https://docs.claude.com/en/docs/resources/glossary#pretraining\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#advantages-of-using-claude\"]",
"prompt": "What are the key differences between a pretrained large language model and Claude in terms of their training and capabilities?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1219,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/resources/glossary#pretraining\",\"https://docs.claude.com/en/docs/resources/glossary#llm\",\"https://docs.claude.com/en/docs/resources/glossary#rlhf\"]",
"prompt": "What are the key differences between a pretrained large language model and Claude in terms of their training and capabilities?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 3803,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "What are the key differences between a pretrained large language model and Claude in terms of their training and capabilities?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#llm\",\"https://docs.claude.com/en/docs/resources/glossary#pretraining\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #86"
},
"vars": [
"[\"https://docs.claude.com/en/docs/resources/glossary#llm\",\"https://docs.claude.com/en/docs/resources/glossary#pretraining\"]",
"What are the key differences between a pretrained large language model and Claude in terms of their training and capabilities?"
]
},
{
"description": "Row #87",
"outputs": [
{
"pass": true,
"score": 0.6666666666666666,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/resources/glossary#pretraining\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak\"]",
"prompt": "What are some key advantages of using prompt engineering instead of fine-tuning to adapt a pretrained language model for a specific task or domain?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1375,
"gradingResult": {
"pass": true,
"score": 0.6666666666666666,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision: 0.6666666666666666 \n Recall: 0.6666666666666666 \n F1 Score: 0.6666666666666666 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Recall is 0.6666666666666666",
"named_scores": {
"Recall": 0.6666666666666666
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "F1 is 0.6666666666666666",
"named_scores": {
"F1": 0.6666666666666666
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Recall is 0.6666666666666666",
"named_scores": {
"Recall": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "F1 is 0.6666666666666666",
"named_scores": {
"F1": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.6666666666666666,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/resources/glossary#pretraining\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak\"]",
"prompt": "What are some key advantages of using prompt engineering instead of fine-tuning to adapt a pretrained language model for a specific task or domain?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1393,
"gradingResult": {
"pass": true,
"score": 0.6666666666666666,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision: 0.6666666666666666 \n Recall: 0.6666666666666666 \n F1 Score: 0.6666666666666666 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Recall is 0.6666666666666666",
"named_scores": {
"Recall": 0.6666666666666666
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "F1 is 0.6666666666666666",
"named_scores": {
"F1": 0.6666666666666666
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Recall is 0.6666666666666666",
"named_scores": {
"Recall": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "F1 is 0.6666666666666666",
"named_scores": {
"F1": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.6666666666666666,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/resources/glossary#pretraining\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak\"]",
"prompt": "What are some key advantages of using prompt engineering instead of fine-tuning to adapt a pretrained language model for a specific task or domain?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 0,
"gradingResult": {
"pass": true,
"score": 0.6666666666666666,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision: 0.6666666666666666 \n Recall: 0.6666666666666666 \n F1 Score: 0.6666666666666666 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Recall is 0.6666666666666666",
"named_scores": {
"Recall": 0.6666666666666666
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "F1 is 0.6666666666666666",
"named_scores": {
"F1": 0.6666666666666666
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Recall is 0.6666666666666666",
"named_scores": {
"Recall": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "F1 is 0.6666666666666666",
"named_scores": {
"F1": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "What are some key advantages of using prompt engineering instead of fine-tuning to adapt a pretrained language model for a specific task or domain?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/resources/glossary#pretraining\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #87"
},
"vars": [
"[\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/resources/glossary#pretraining\"]",
"What are some key advantages of using prompt engineering instead of fine-tuning to adapt a pretrained language model for a specific task or domain?"
]
},
{
"description": "Row #88",
"outputs": [
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/claude-on-vertex-ai#making-requests\",\"https://docs.claude.com/en/docs/about-claude/models#get-started-with-claude\",\"https://docs.claude.com/en/api/claude-on-vertex-ai#api-model-names\"]",
"prompt": "How can you authenticate with GCP before running requests to access Claude models on Vertex AI?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1721,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/claude-on-vertex-ai#accessing-vertex-ai\",\"https://docs.claude.com/en/api/claude-on-vertex-ai#making-requests\",\"https://docs.claude.com/en/docs/about-claude/models#get-started-with-claude\"]",
"prompt": "How can you authenticate with GCP before running requests to access Claude models on Vertex AI?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1621,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/claude-on-vertex-ai#making-requests\",\"https://docs.claude.com/en/api/claude-on-vertex-ai#install-an-sdk-for-accessing-vertex-ai\",\"https://docs.claude.com/en/api/claude-on-vertex-ai#model-availability\"]",
"prompt": "How can you authenticate with GCP before running requests to access Claude models on Vertex AI?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 3309,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "How can you authenticate with GCP before running requests to access Claude models on Vertex AI?",
"correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-vertex-ai#making-requests\",\"https://docs.claude.com/en/api/claude-on-vertex-ai#accessing-vertex-ai\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #88"
},
"vars": [
"[\"https://docs.claude.com/en/api/claude-on-vertex-ai#making-requests\",\"https://docs.claude.com/en/api/claude-on-vertex-ai#accessing-vertex-ai\"]",
"How can you authenticate with GCP before running requests to access Claude models on Vertex AI?"
]
},
{
"description": "Row #89",
"outputs": [
{
"pass": true,
"score": 0.5,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources\",\"https://docs.claude.com/en/docs/quickstart#next-steps\",\"https://docs.claude.com/en/release-notes/api#may-10th-2024\"]",
"prompt": "What new capabilities and features were introduced by Anthropic on May 10th, 2024 and how do they enable users to create and tailor prompts for specific tasks?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1471,
"gradingResult": {
"pass": true,
"score": 0.5,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 0.3333333333333333",
"componentResults": [
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.5,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/release-notes/api#may-10th-2024\",\"https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources\",\"https://docs.claude.com/en/docs/quickstart#next-steps\"]",
"prompt": "What new capabilities and features were introduced by Anthropic on May 10th, 2024 and how do they enable users to create and tailor prompts for specific tasks?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1466,
"gradingResult": {
"pass": true,
"score": 0.5,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.5,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/release-notes/api#may-10th-2024\",\"https://docs.claude.com/en/docs/welcome#develop-with-claude\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#before-prompt-engineering\"]",
"prompt": "What new capabilities and features were introduced by Anthropic on May 10th, 2024 and how do they enable users to create and tailor prompts for specific tasks?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 3911,
"gradingResult": {
"pass": true,
"score": 0.5,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "What new capabilities and features were introduced by Anthropic on May 10th, 2024 and how do they enable users to create and tailor prompts for specific tasks?",
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#may-10th-2024\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #89"
},
"vars": [
"[\"https://docs.claude.com/en/release-notes/api#may-10th-2024\"]",
"What new capabilities and features were introduced by Anthropic on May 10th, 2024 and how do they enable users to create and tailor prompts for specific tasks?"
]
},
{
"description": "Row #90",
"outputs": [
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/release-notes/claude-apps#june-20th-2024\",\"https://docs.claude.com/en/release-notes/api#june-20th-2024\",\"https://docs.claude.com/en/docs/intro-to-claude#claude-3-5-family\"]",
"prompt": "On what date did both the Claude 3.5 Sonnet model and the Artifacts feature in Claude.ai become available?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1367,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/release-notes/claude-apps#june-20th-2024\",\"https://docs.claude.com/en/release-notes/api#june-20th-2024\",\"https://docs.claude.com/en/docs/intro-to-claude#claude-3-5-family\"]",
"prompt": "On what date did both the Claude 3.5 Sonnet model and the Artifacts feature in Claude.ai become available?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1388,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/release-notes/claude-apps#june-20th-2024\",\"https://docs.claude.com/en/release-notes/api#june-20th-2024\",\"https://docs.claude.com/en/docs/intro-to-claude#claude-3-5-family\"]",
"prompt": "On what date did both the Claude 3.5 Sonnet model and the Artifacts feature in Claude.ai become available?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 3867,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "On what date did both the Claude 3.5 Sonnet model and the Artifacts feature in Claude.ai become available?",
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#june-20th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#june-20th-2024\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #90"
},
"vars": [
"[\"https://docs.claude.com/en/release-notes/api#june-20th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#june-20th-2024\"]",
"On what date did both the Claude 3.5 Sonnet model and the Artifacts feature in Claude.ai become available?"
]
},
{
"description": "Row #91",
"outputs": [
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#controlling-claudes-output\",\"https://docs.claude.com/en/docs/resources/glossary#tokens\"]",
"prompt": "When putting words in Claude's mouth to shape the response, what header and value can you use in the request to limit Claude's response to a single token?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1305,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.8,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\",\"https://docs.claude.com/en/api/messages-examples#basic-request-and-response\",\"https://docs.claude.com/en/docs/resources/glossary#tokens\"]",
"prompt": "When putting words in Claude's mouth to shape the response, what header and value can you use in the request to limit Claude's response to a single token?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1243,
"gradingResult": {
"pass": true,
"score": 0.8,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.8,
"reason": "Precision: 0.6666666666666666 \n Recall: 1.0 \n F1 Score: 0.8 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.8,
"reason": "F1 is 0.8",
"named_scores": {
"F1": 0.8
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\",\"https://docs.claude.com/en/api/rate-limits#rate-limits\",\"https://docs.claude.com/en/docs/resources/glossary#tokens\"]",
"prompt": "When putting words in Claude's mouth to shape the response, what header and value can you use in the request to limit Claude's response to a single token?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 4162,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "When putting words in Claude's mouth to shape the response, what header and value can you use in the request to limit Claude's response to a single token?",
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#basic-request-and-response\",\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #91"
},
"vars": [
"[\"https://docs.claude.com/en/api/messages-examples#basic-request-and-response\",\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\"]",
"When putting words in Claude's mouth to shape the response, what header and value can you use in the request to limit Claude's response to a single token?"
]
},
{
"description": "Row #92",
"outputs": [
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/resources/glossary#temperature\",\"https://docs.claude.com/en/docs/welcome#models\",\"https://docs.claude.com/en/docs/resources/glossary#tokens\"]",
"prompt": "What does the temperature parameter do when working with large language models?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1355,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/resources/glossary#temperature\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#controlling-claudes-output\",\"https://docs.claude.com/en/docs/welcome#models\"]",
"prompt": "What does the temperature parameter do when working with large language models?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1363,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/resources/glossary#temperature\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#controlling-claudes-output\",\"https://docs.claude.com/en/docs/welcome#models\"]",
"prompt": "What does the temperature parameter do when working with large language models?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 0,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "What does the temperature parameter do when working with large language models?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#temperature\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#2-optimize-prompt-and-output-length\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #92"
},
"vars": [
"[\"https://docs.claude.com/en/docs/resources/glossary#temperature\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#2-optimize-prompt-and-output-length\"]",
"What does the temperature parameter do when working with large language models?"
]
},
{
"description": "Row #93",
"outputs": [
{
"pass": true,
"score": 0.3333333333333333,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#claude-for-sheets-usage-examples\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#get-started-with-claude-for-sheets\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#enter-your-first-prompt\"]",
"prompt": "What are two ways to specify API parameters when calling the Claude API using Claude for Sheets?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1675,
"gradingResult": {
"pass": true,
"score": 0.3333333333333333,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.3333333333333333 \n F1 Score: 0.3333333333333333 \n MRR: 0.3333333333333333",
"componentResults": [
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Recall is 0.3333333333333333",
"named_scores": {
"Recall": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "F1 is 0.3333333333333333",
"named_scores": {
"F1": 0.3333333333333333
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Recall is 0.3333333333333333",
"named_scores": {
"Recall": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "F1 is 0.3333333333333333",
"named_scores": {
"F1": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.3333333333333333,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#get-started-with-claude-for-sheets\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#enter-your-first-prompt\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#optional-function-parameters\"]",
"prompt": "What are two ways to specify API parameters when calling the Claude API using Claude for Sheets?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1635,
"gradingResult": {
"pass": true,
"score": 0.3333333333333333,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.3333333333333333 \n F1 Score: 0.3333333333333333 \n MRR: 0.5",
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Recall is 0.3333333333333333",
"named_scores": {
"Recall": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "F1 is 0.3333333333333333",
"named_scores": {
"F1": 0.3333333333333333
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Recall is 0.3333333333333333",
"named_scores": {
"Recall": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "F1 is 0.3333333333333333",
"named_scores": {
"F1": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.3333333333333333,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#enter-your-first-prompt\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#optional-function-parameters\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#advanced-use\"]",
"prompt": "What are two ways to specify API parameters when calling the Claude API using Claude for Sheets?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 3207,
"gradingResult": {
"pass": true,
"score": 0.3333333333333333,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.3333333333333333 \n F1 Score: 0.3333333333333333 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Recall is 0.3333333333333333",
"named_scores": {
"Recall": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "F1 is 0.3333333333333333",
"named_scores": {
"F1": 0.3333333333333333
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Recall is 0.3333333333333333",
"named_scores": {
"Recall": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "F1 is 0.3333333333333333",
"named_scores": {
"F1": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "What are two ways to specify API parameters when calling the Claude API using Claude for Sheets?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#tips-for-effective-evaluation\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#how-to-prefill-claudes-response\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#enter-your-first-prompt\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #93"
},
"vars": [
"[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#tips-for-effective-evaluation\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#how-to-prefill-claudes-response\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#enter-your-first-prompt\"]",
"What are two ways to specify API parameters when calling the Claude API using Claude for Sheets?"
]
},
{
"description": "Row #94",
"outputs": [
{
"pass": true,
"score": 0.5,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#example-1-controlling-output-formatting-and-skipping-the-preamble\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#how-to-prefill-claudes-response\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#prefill-claudes-response\"]",
"prompt": "How does prefilling the response with an opening curly brace ({ ) affect Claude's output when extracting structured data from text?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1347,
"gradingResult": {
"pass": true,
"score": 0.5,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.5,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#example-1-controlling-output-formatting-and-skipping-the-preamble\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#prefill-claudes-response\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#how-to-prefill-claudes-response\"]",
"prompt": "How does prefilling the response with an opening curly brace ({ ) affect Claude's output when extracting structured data from text?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1293,
"gradingResult": {
"pass": true,
"score": 0.5,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.5,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#example-1-controlling-output-formatting-and-skipping-the-preamble\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#prefill-claudes-response\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#how-to-prefill-claudes-response\"]",
"prompt": "How does prefilling the response with an opening curly brace ({ ) affect Claude's output when extracting structured data from text?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 4071,
"gradingResult": {
"pass": true,
"score": 0.5,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "Precision: 0.3333333333333333 \n Recall: 1.0 \n F1 Score: 0.5 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "F1 is 0.5",
"named_scores": {
"F1": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "How does prefilling the response with an opening curly brace ({ ) affect Claude's output when extracting structured data from text?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#example-1-controlling-output-formatting-and-skipping-the-preamble\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #94"
},
"vars": [
"[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#example-1-controlling-output-formatting-and-skipping-the-preamble\"]",
"How does prefilling the response with an opening curly brace ({ ) affect Claude's output when extracting structured data from text?"
]
},
{
"description": "Row #95",
"outputs": [
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#dive-deeper-into-vision\",\"https://docs.claude.com/en/docs/intro-to-claude#start-building-with-claude\",\"https://docs.claude.com/en/docs/welcome#develop-with-claude\"]",
"prompt": "What are some helpful resources provided by Anthropic to dive deeper into building with images using Claude?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1530,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#prompt-examples\",\"https://docs.claude.com/en/docs/build-with-claude/vision#dive-deeper-into-vision\",\"https://docs.claude.com/en/docs/welcome#develop-with-claude\"]",
"prompt": "What are some helpful resources provided by Anthropic to dive deeper into building with images using Claude?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1487,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#dive-deeper-into-vision\",\"https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook\",\"https://docs.claude.com/en/api/messages-examples#vision\"]",
"prompt": "What are some helpful resources provided by Anthropic to dive deeper into building with images using Claude?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 5134,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "What are some helpful resources provided by Anthropic to dive deeper into building with images using Claude?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#dive-deeper-into-vision\",\"https://docs.claude.com/en/docs/build-with-claude/vision#about-the-prompt-examples\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #95"
},
"vars": [
"[\"https://docs.claude.com/en/docs/build-with-claude/vision#dive-deeper-into-vision\",\"https://docs.claude.com/en/docs/build-with-claude/vision#about-the-prompt-examples\"]",
"What are some helpful resources provided by Anthropic to dive deeper into building with images using Claude?"
]
},
{
"description": "Row #96",
"outputs": [
{
"pass": false,
"score": 0,
"namedScores": {},
"text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/quickstart#set-your-api-key\",\"https://docs.claude.com/en/docs/quickstart#prerequisites\",\"https://docs.claude.com/en/api/#authentication\"]",
"prompt": "How do you specify the API key when creating a new Anthropic client in the Python and TypeScript SDK examples?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1355,
"gradingResult": {
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"componentResults": [
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/#authentication\",\"https://docs.claude.com/en/docs/quickstart#set-your-api-key\",\"https://docs.claude.com/en/api/client-sdks#typescript\"]",
"prompt": "How do you specify the API key when creating a new Anthropic client in the Python and TypeScript SDK examples?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1357,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
"componentResults": [
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/#authentication\",\"https://docs.claude.com/en/docs/quickstart#set-your-api-key\",\"https://docs.claude.com/en/api/client-sdks#typescript\"]",
"prompt": "How do you specify the API key when creating a new Anthropic client in the Python and TypeScript SDK examples?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 1,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.3333333333333333",
"componentResults": [
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "How do you specify the API key when creating a new Anthropic client in the Python and TypeScript SDK examples?",
"correct_chunks": "[\"https://docs.claude.com/en/api/client-sdks#typescript\",\"https://docs.claude.com/en/api/client-sdks#python\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #96"
},
"vars": [
"[\"https://docs.claude.com/en/api/client-sdks#typescript\",\"https://docs.claude.com/en/api/client-sdks#python\"]",
"How do you specify the API key when creating a new Anthropic client in the Python and TypeScript SDK examples?"
]
},
{
"description": "Row #97",
"outputs": [
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#before-prompt-engineering\",\"https://docs.claude.com/en/docs/resources/glossary#hhh\"]",
"prompt": "What are two key benefits of using the Anthropic Evaluation tool when developing prompts for an AI classification application?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1538,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#before-prompt-engineering\",\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\",\"https://docs.claude.com/en/docs/build-with-claude/text-generation#more-resources\"]",
"prompt": "What are two key benefits of using the Anthropic Evaluation tool when developing prompts for an AI classification application?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1545,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#advantages-of-using-claude\",\"https://docs.claude.com/en/docs/about-claude/models#prompt-and-output-performance\"]",
"prompt": "What are two key benefits of using the Anthropic Evaluation tool when developing prompts for an AI classification application?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 3868,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "What are two key benefits of using the Anthropic Evaluation tool when developing prompts for an AI classification application?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#2-develop-your-test-cases\",\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #97"
},
"vars": [
"[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#2-develop-your-test-cases\",\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\"]",
"What are two key benefits of using the Anthropic Evaluation tool when developing prompts for an AI classification application?"
]
},
{
"description": "Row #98",
"outputs": [
{
"pass": true,
"score": 0.3333333333333333,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/welcome#models\",\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\",\"https://docs.claude.com/en/docs/resources/glossary#pretraining\"]",
"prompt": "What are the key differences between a pretrained language model like Claude's underlying model, and the final version of Claude available through Anthropic's API?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1350,
"gradingResult": {
"pass": true,
"score": 0.3333333333333333,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.3333333333333333 \n F1 Score: 0.3333333333333333 \n MRR: 0.3333333333333333",
"componentResults": [
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Recall is 0.3333333333333333",
"named_scores": {
"Recall": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "F1 is 0.3333333333333333",
"named_scores": {
"F1": 0.3333333333333333
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "MRR is 0.3333333333333333",
"named_scores": {
"MRR": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Recall is 0.3333333333333333",
"named_scores": {
"Recall": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "F1 is 0.3333333333333333",
"named_scores": {
"F1": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": false,
"score": 0,
"namedScores": {},
"text": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0\n---\n[\"https://docs.claude.com/en/docs/welcome#models\",\"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#advantages-of-using-claude\"]",
"prompt": "What are the key differences between a pretrained language model like Claude's underlying model, and the final version of Claude available through Anthropic's API?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1310,
"gradingResult": {
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": false,
"score": 0,
"reason": "Precision: 0.0 \n Recall: 0.0 \n F1 Score: 0 \n MRR: 0",
"componentResults": [
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "MRR is 0",
"named_scores": {
"MRR": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Precision is 0.0",
"named_scores": {
"Precision": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "Recall is 0.0",
"named_scores": {
"Recall": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0,
"reason": "F1 is 0",
"named_scores": {
"F1": 0
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.6666666666666666,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/resources/glossary#pretraining\",\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\",\"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison\"]",
"prompt": "What are the key differences between a pretrained language model like Claude's underlying model, and the final version of Claude available through Anthropic's API?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 3517,
"gradingResult": {
"pass": true,
"score": 0.6666666666666666,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision: 0.6666666666666666 \n Recall: 0.6666666666666666 \n F1 Score: 0.6666666666666666 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Recall is 0.6666666666666666",
"named_scores": {
"Recall": 0.6666666666666666
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "F1 is 0.6666666666666666",
"named_scores": {
"F1": 0.6666666666666666
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Precision is 0.6666666666666666",
"named_scores": {
"Precision": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "Recall is 0.6666666666666666",
"named_scores": {
"Recall": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.6666666666666666,
"reason": "F1 is 0.6666666666666666",
"named_scores": {
"F1": 0.6666666666666666
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "What are the key differences between a pretrained language model like Claude's underlying model, and the final version of Claude available through Anthropic's API?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#pretraining\",\"https://docs.claude.com/en/docs/resources/glossary#llm\",\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #98"
},
"vars": [
"[\"https://docs.claude.com/en/docs/resources/glossary#pretraining\",\"https://docs.claude.com/en/docs/resources/glossary#llm\",\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\"]",
"What are the key differences between a pretrained language model like Claude's underlying model, and the final version of Claude available through Anthropic's API?"
]
},
{
"description": "Row #99",
"outputs": [
{
"pass": true,
"score": 1,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/ip-addresses#ipv6\"]",
"prompt": "What is the IPv6 address range used by Anthropic?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1362,
"gradingResult": {
"pass": true,
"score": 1,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 1,
"reason": "Precision is 1.0",
"named_scores": {
"Precision": 1
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 1,
"reason": "F1 is 1.0",
"named_scores": {
"F1": 1
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Precision is 1.0",
"named_scores": {
"Precision": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "F1 is 1.0",
"named_scores": {
"F1": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 1,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/ip-addresses#ipv6\"]",
"prompt": "What is the IPv6 address range used by Anthropic?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1303,
"gradingResult": {
"pass": true,
"score": 1,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 1,
"reason": "Precision is 1.0",
"named_scores": {
"Precision": 1
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 1,
"reason": "F1 is 1.0",
"named_scores": {
"F1": 1
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Precision is 1.0",
"named_scores": {
"Precision": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "F1 is 1.0",
"named_scores": {
"F1": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 1,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/ip-addresses#ipv6\"]",
"prompt": "What is the IPv6 address range used by Anthropic?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 0,
"gradingResult": {
"pass": true,
"score": 1,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "Precision: 1.0 \n Recall: 1.0 \n F1 Score: 1.0 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 1,
"reason": "Precision is 1.0",
"named_scores": {
"Precision": 1
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
}
},
{
"pass": true,
"score": 1,
"reason": "F1 is 1.0",
"named_scores": {
"F1": 1
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Precision is 1.0",
"named_scores": {
"Precision": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "Recall is 1.0",
"named_scores": {
"Recall": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "F1 is 1.0",
"named_scores": {
"F1": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "What is the IPv6 address range used by Anthropic?",
"correct_chunks": "[\"https://docs.claude.com/en/api/ip-addresses#ipv6\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #99"
},
"vars": [
"[\"https://docs.claude.com/en/api/ip-addresses#ipv6\"]",
"What is the IPv6 address range used by Anthropic?"
]
},
{
"description": "Row #100",
"outputs": [
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/api/client-sdks#python\",\"https://docs.claude.com/en/docs/quickstart#call-the-api\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#advanced-use\"]",
"prompt": "When using the Python SDK to create a message with Claude, what are two ways you can specify your API key?",
"provider": "python:provider_retrieval.py:retrieve_base",
"latencyMs": 1533,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 1.0",
"componentResults": [
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 1,
"reason": "MRR is 1.0",
"named_scores": {
"MRR": 1
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/quickstart#set-your-api-key\",\"https://docs.claude.com/en/api/client-sdks#python\",\"https://docs.claude.com/en/api/client-sdks#typescript\"]",
"prompt": "When using the Python SDK to create a message with Claude, what are two ways you can specify your API key?",
"provider": "python:provider_retrieval.py:retrieve_level_two",
"latencyMs": 1535,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
},
{
"pass": true,
"score": 0.4,
"namedScores": {},
"text": "[\"https://docs.claude.com/en/docs/quickstart#set-your-api-key\",\"https://docs.claude.com/en/api/client-sdks#python\",\"https://docs.claude.com/en/docs/quickstart#call-the-api\"]",
"prompt": "When using the Python SDK to create a message with Claude, what are two ways you can specify your API key?",
"provider": "python:provider_retrieval.py:retrieve_level_three",
"latencyMs": 4568,
"gradingResult": {
"pass": true,
"score": 0.4,
"reason": "All assertions passed",
"namedScores": {},
"tokensUsed": {
"total": 0,
"prompt": 0,
"completion": 0
},
"componentResults": [
{
"pass": true,
"score": 0.4,
"reason": "Precision: 0.3333333333333333 \n Recall: 0.5 \n F1 Score: 0.4 \n MRR: 0.5",
"componentResults": [
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
}
}
],
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "MRR is 0.5",
"named_scores": {
"MRR": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.3333333333333333,
"reason": "Precision is 0.3333333333333333",
"named_scores": {
"Precision": 0.3333333333333333
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.5,
"reason": "Recall is 0.5",
"named_scores": {
"Recall": 0.5
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
},
{
"pass": true,
"score": 0.4,
"reason": "F1 is 0.4",
"named_scores": {
"F1": 0.4
},
"assertion": {
"type": "python",
"value": "file://eval_retrieval.py"
}
}
],
"assertion": null
},
"cost": 0
}
],
"test": {
"vars": {
"query": "When using the Python SDK to create a message with Claude, what are two ways you can specify your API key?",
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns\",\"https://docs.claude.com/en/api/client-sdks#python\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #100"
},
"vars": [
"[\"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns\",\"https://docs.claude.com/en/api/client-sdks#python\"]",
"When using the Python SDK to create a message with Claude, what are two ways you can specify your API key?"
]
}
]
}
},
"config": {
"description": "Retrieval - Base Eval",
"prompts": [
"{{ query }}"
],
"providers": [
"python:provider_retrieval.py:retrieve_base",
"python:provider_retrieval.py:retrieve_level_two",
"python:provider_retrieval.py:retrieve_level_three"
],
"tests": [
{
"vars": {
"query": "How can you create multiple test cases for an evaluation in the Anthropic Evaluation tool?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#creating-test-cases\",\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#building-evals-and-test-cases\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #1"
},
{
"vars": {
"query": "What embeddings provider does Anthropic recommend for customized domain-specific models, and what capabilities does this provider offer?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#before-implementing-embeddings\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #2"
},
{
"vars": {
"query": "What are some key success metrics to consider when evaluating Claude's performance on a classification task, and how do they relate to choosing the right model to reduce latency?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#evaluation-metrics\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #3"
},
{
"vars": {
"query": "What are two ways that Claude for Sheets can improve prompt engineering workflows compared to using chained prompts?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#why-use-claude-for-sheets\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #4"
},
{
"vars": {
"query": "What happens if a prompt for the Text Completions API is missing the \"\\n\\nHuman:\" and \"\\n\\nAssistant:\" turns?",
"correct_chunks": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt\",\"https://docs.claude.com/en/api/prompt-validation#examples\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #5"
},
{
"vars": {
"query": "How do the additional tokens required for tool use in Claude API requests impact pricing compared to regular API requests?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#pricing\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #6"
},
{
"vars": {
"query": "When will the new Anthropic Developer Console features that show API usage, billing details, and rate limits be available?",
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#june-27th-2024\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #7"
},
{
"vars": {
"query": "When deciding whether to use chain-of-thought (CoT) for a task, what are two key factors to consider in order to strike the right balance between performance and latency?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#why-not-let-claude-think\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-of-thought#before-implementing-cot\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #8"
},
{
"vars": {
"query": "How can I use Claude to more easily digest the content of long PDF documents?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/text-generation#anthropic-cookbook\",\"https://docs.claude.com/en/docs/build-with-claude/vision#before-you-upload\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #9"
},
{
"vars": {
"query": "According to the documentation, where can you view your organization's current API rate limits in the Claude Console?",
"correct_chunks": "[\"https://docs.claude.com/en/api/rate-limits#about-our-limits\",\"https://docs.claude.com/en/release-notes/api#june-27th-2024\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #10"
},
{
"vars": {
"query": "How can we measure the performance of the ticket classification system implemented using Claude beyond just accuracy?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#prompting-claude-for-ticket-routing\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #11"
},
{
"vars": {
"query": "How can you specify a system prompt using the Text Completions API versus the Messages API?",
"correct_chunks": "[\"https://docs.claude.com/en/api/prompt-validation#examples\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#system-prompt\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #12"
},
{
"vars": {
"query": "How can you combine XML tags with chain of thought reasoning to create high-performance prompts for Claude?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#tagging-best-practices\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#chain-of-thought\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #13"
},
{
"vars": {
"query": "When evaluating the Claude model's performance for ticket routing, what three key metrics are calculated and what are the results for the claude-3-haiku-20240307 model on the 91 test samples?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluation-methodology\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#example-data\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #14"
},
{
"vars": {
"query": "Before starting to engineer and improve a prompt in Claude, what key things does Anthropic recommend you have in place first?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/define-success#next-steps\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#before-prompt-engineering\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #15"
},
{
"vars": {
"query": "How does the Messages API handle mid-response prompting compared to the Text Completions API?",
"correct_chunks": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#inputs-and-outputs\",\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#putting-words-in-claudes-mouth\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #16"
},
{
"vars": {
"query": "How does Claude's response differ when given a role through a system prompt compared to not having a specific role in the financial analysis example?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-2-financial-analysis\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #17"
},
{
"vars": {
"query": "What are some quantitative metrics that can be used to measure the success of a sentiment analysis model, and how might specific targets for those metrics be determined?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/define-success#building-strong-criteria\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #18"
},
{
"vars": {
"query": "What is a power user tip mentioned in the documentation for creating high-performance prompts using XML tags?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#tagging-best-practices\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #19"
},
{
"vars": {
"query": "How can you use an LLM like Claude to automatically grade the outputs of other LLMs based on a rubric?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#tips-for-llm-based-grading\",\"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #20"
},
{
"vars": {
"query": "How can you access and deploy Voyage embeddings on AWS Marketplace?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-on-the-aws-marketplace\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #21"
},
{
"vars": {
"query": "When using tools just to get Claude to produce JSON output following a particular schema, what key things should you do in terms of tool setup and prompting?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#tool-use-examples\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#json-output\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #22"
},
{
"vars": {
"query": "What are the key differences between the legacy Claude Instant 1.2 model and the Claude 3 Haiku model in terms of capabilities and performance?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison\",\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\",\"https://docs.claude.com/en/docs/about-claude/models#legacy-models\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #23"
},
{
"vars": {
"query": "What is one key benefit of using examples when prompt engineering with Claude?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples\"]"
},
"assert": [
{
"type": "python",
"value": "file://eval_retrieval.py"
}
],
"options": {},
"description": "Row #24"
},
{
"vars": {
"query": "According to the Claude Documentation, what is one key advantage of using prompt engineering instead of fine-tuning when it comes to adapting an AI model to new domains or tasks?",
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #25"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How can I quickly get started using the Claude for Sheets extension with a pre-made template?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#claude-for-sheets-workbook-template\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#get-started-with-claude-for-sheets\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #26"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How does the \"index\" field in the \"content_block_delta\" event relate to the text being streamed in a response?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#basic-streaming-request\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #27"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How can you include an image as part of a Claude API request, and what image formats are currently supported?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#vision\",\"https://docs.claude.com/en/docs/build-with-claude/vision#about-the-prompt-examples\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #28"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What is the relationship between time to first token (TTFT) and latency when evaluating a language model's performance?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#ttft-time-to-first-token\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#how-to-measure-latency\",\"https://docs.claude.com/en/docs/resources/glossary#latency\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #29"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How can providing Claude with examples of handling certain edge cases like implicit requests or emotional prioritization help improve its performance in routing support tickets?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#adapting-to-common-scenarios\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#prompting-claude-for-ticket-routing\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #30"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How does the stop_reason of \"tool_use\" relate to the overall workflow of integrating external tools with Claude?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#tool-use-and-json-mode\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #31"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "According to the documentation, what error event and corresponding HTTP error code may be sent during periods of high usage for the Claude API when using streaming responses?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#error-events\",\"https://docs.claude.com/en/api/streaming#error-event-types\",\"https://docs.claude.com/en/api/errors#http-errors\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #32"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are the two types of deltas that can be contained in a content_block_delta event when streaming responses from the Claude API?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#delta-types\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #33"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "On what date did Claude 3.5 Sonnet and tool use both become generally available across the Claude API, Amazon Bedrock, and Google Vertex AI?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#june-20th-2024\",\"https://docs.claude.com/en/release-notes/api#may-30th-2024\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #34"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "In what order did Anthropic launch Claude.ai and the Claude iOS app in Canada and Europe?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #35"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "When the API response from Claude has a stop_reason of \"tool_use\", what does this indicate and what should be done next to continue the conversation?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#json-output\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#how-tool-use-works\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #36"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What Python libraries are used in the example code snippet for evaluating tone and style in a customer service chatbot?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#example-evals\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #37"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are the two main ways to authenticate when using the Anthropic Python SDK to access Claude models on Amazon Bedrock?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#install-an-sdk-for-accessing-bedrock\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #38"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "When deciding whether to implement leak-resistant prompt engineering strategies, what two factors should be considered and balanced?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#strategies-to-reduce-prompt-leak\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-prompt-leak#before-you-try-to-reduce-prompt-leak\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #39"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How can selecting the appropriate Claude model based on your specific requirements help reduce latency in your application?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\",\"https://docs.claude.com/en/docs/intro-to-claude#model-options\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #40"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How can you stream responses from the Claude API using the Python SDK?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#streaming-with-sdks\",\"https://docs.claude.com/en/api/client-sdks#python\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #41"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How can you guide Claude's response by pre-filling part of the response, and what API parameter is used to generate a short response in this case?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\",\"https://docs.claude.com/en/api/messages-examples#basic-request-and-response\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #42"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What is more important when building an eval set for an AI system - having a larger number of test cases with automated grading, or having fewer high-quality test cases graded by humans?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#eval-design-principles\",\"https://docs.claude.com/en/docs/build-with-claude/develop-tests#building-evals-and-test-cases\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #43"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are the two required fields in a content_block_delta event for a text delta type?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#delta-types\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #44"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are two interactive ways to learn how to use Claude's capabilities, such as uploading PDFs and generating embeddings?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/quickstart#next-steps\",\"https://docs.claude.com/en/docs/welcome#develop-with-claude\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #45"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "Why does breaking a task into distinct subtasks for chained prompts help improve Claude's accuracy on the overall task?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#how-to-chain-prompts\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #46"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How does the streaming format for Messages responses differ from Text Completions streaming responses?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/migrating-from-text-completions-to-messages#streaming-format\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #47"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are two ways to start experimenting with Claude as a user, according to Anthropic's documentation?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/models#get-started-with-claude\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #48"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How can using chain prompts help reduce errors and inconsistency in complex tasks handled by Claude?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/chain-prompts#why-chain-prompts\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#chain-prompts-for-complex-tasks\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #49"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What HTTP status code does an overloaded_error event correspond to in a non-streaming context for the Claude API?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/streaming#error-event-types\",\"https://docs.claude.com/en/api/messages-streaming#error-events\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #50"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are the two ways to specify the format in which Voyage AI returns embeddings through its HTTP API?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #51"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "When streaming API requests that use tools, how are the input JSON deltas for tool_use content blocks sent, and how can they be accumulated and parsed by the client?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#streaming-request-with-tool-use\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #52"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are the two interactive prompt engineering tutorials that Anthropic offers, and how do they differ?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#prompt-engineering-interactive-tutorial\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #53"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are some of the key capabilities that make Claude suitable for enterprise use cases requiring integration with specialized applications and processing of large volumes of sensitive data?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/intro-to-claude#enterprise-considerations\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #54"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "As of June 2024, in which regions are Anthropic's Claude.ai API and iOS app available?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/claude-apps#may-1st-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#june-5th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#may-13th-2024\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #55"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are the two main approaches for integrating Claude into a support ticket workflow, and how do they differ in terms of scalability and ease of implementation?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#introduction\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #56"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "When did Anthropic release a prompt generator tool to help guide Claude in generating high-quality prompts, and through what interface is it available?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#may-10th-2024\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #57"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "Which Claude 3 model provides the best balance of intelligence and speed for high-throughput tasks like sales forecasting and targeted marketing?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-vertex-ai#api-model-names\",\"https://docs.claude.com/en/docs/intro-to-claude#claude-3-family\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #58"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How can you calculate the similarity between two Voyage embedding vectors, and what is this equivalent to since Voyage embeddings are normalized to length 1?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#faq\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-embedding-example\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #59"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How can using examples in prompts improve Claude's performance on complex tasks?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/multishot-prompting#why-use-examples\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency#chain-prompts-for-complex-tasks\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #60"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are the two types of content block deltas that can be emitted when streaming responses with tool use, and what does each delta type contain?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\",\"https://docs.claude.com/en/api/messages-streaming#streaming-request-with-tool-use\",\"https://docs.claude.com/en/api/messages-streaming#delta-types\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #61"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are two key capabilities of Claude that enable it to build interactive systems and personalized user experiences?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/text-generation#text-capabilities-and-use-cases\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #62"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are the key event types included in a raw HTTP stream response when using message streaming, and what is the typical order they occur in?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#event-types\",\"https://docs.claude.com/en/api/messages-streaming#raw-http-stream-response\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #63"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What is the maximum number of images that can be included in a single request using the Claude API compared to the claude.ai interface?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#about-the-prompt-examples\",\"https://docs.claude.com/en/docs/build-with-claude/vision#faq\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #64"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "When Claude's response is cut off due to hitting the max_tokens limit and contains an incomplete tool use block, what should you do to get the full tool use?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#troubleshooting-errors\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #65"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What two steps are needed before running a classification evaluation on Claude according to the documentation?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#3-run-your-eval\",\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#2-develop-your-test-cases\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #66"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How can you use the content parameter in the messages list to influence Claude's response?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#basic-request-and-response\",\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #67"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are two key advantages of prompt engineering over fine-tuning when it comes to model comprehension and general knowledge preservation?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #68"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are the two main steps to get started with making requests to Claude models on Anthropic's Bedrock API?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#install-and-configure-the-aws-cli\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#making-requests\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #69"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How can you check which Claude models are available in a specific AWS region using the AWS CLI?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#subscribe-to-anthropic-models\",\"https://docs.claude.com/en/api/claude-on-amazon-bedrock#list-available-models\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #70"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What argument can be passed to the voyageai.Client.embed() method or the Voyage HTTP API to specify whether the input text is a query or a document?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-python-package\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#voyage-http-api\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #71"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How do the streaming API delta formats differ between tool_use content blocks and text content blocks?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-streaming#input-json-delta\",\"https://docs.claude.com/en/api/messages-streaming#text-delta\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #72"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are the image file size limits when uploading images to Claude using the API versus on claude.ai?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#faq\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #73"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What is one key consideration when selecting a Claude model for an enterprise use case that needs low latency?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/intro-to-claude#model-options\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#1-choose-the-right-model\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #74"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What embedding model does Anthropic recommend for code retrieval, and how does its performance compare to alternatives according to Voyage AI?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/embeddings#how-to-get-embeddings-with-anthropic\",\"https://docs.claude.com/en/docs/build-with-claude/embeddings#available-voyage-models\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #75"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are two ways the Claude Cookbook can help developers learn to use Anthropic's APIs?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/welcome#develop-with-claude\",\"https://docs.claude.com/en/docs/quickstart#next-steps\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #76"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How does the size of the context window impact a language model's ability to utilize retrieval augmented generation (RAG)?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#context-window\",\"https://docs.claude.com/en/docs/resources/glossary#rag-retrieval-augmented-generation\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #77"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How can the Evaluation tool in Anthropic's Claude platform help improve prompts and build more robust AI applications?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\",\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#creating-test-cases\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #78"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "Which Claude model has the fastest comparative latency according to the comparison tables?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/models#model-comparison\",\"https://docs.claude.com/en/docs/about-claude/models#legacy-model-comparison\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #79"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How can you build up a conversation with multiple turns using the Anthropic Messages API in Python?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/client-sdks#python\",\"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #80"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How can using XML tags to provide a specific role or context help improve Claude's analysis of a legal contract compared to not using a role prompt?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/use-xml-tags#examples\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/system-prompts#example-1-legal-contract-analysis\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #81"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are the key differences between how Claude 3 Opus and Claude 3 Sonnet handle missing information when making tool calls?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/tool-use#chain-of-thought\",\"https://docs.claude.com/en/docs/build-with-claude/tool-use#tool-use-examples\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #82"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What steps should be taken to ensure a reliable deployment of an automated ticket routing system using Claude into a production environment?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#additional-considerations\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #83"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How should you evaluate a model's performance on a ticket routing classifier?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#evaluating-the-performance-of-your-ticket-routing-classifier\",\"https://docs.claude.com/en/docs/about-claude/use-cases/ticket-routing#integrate-claude-into-your-support-workflow\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #84"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What two methods does Anthropic recommend for learning how to prompt engineer with Claude before diving into the techniques?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#how-to-prompt-engineer\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#prompt-engineering-tutorial\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #85"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are the key differences between a pretrained large language model and Claude in terms of their training and capabilities?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#llm\",\"https://docs.claude.com/en/docs/resources/glossary#pretraining\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #86"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are some key advantages of using prompt engineering instead of fine-tuning to adapt a pretrained language model for a specific task or domain?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering#when-to-prompt-engineer\",\"https://docs.claude.com/en/docs/resources/glossary#pretraining\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #87"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How can you authenticate with GCP before running requests to access Claude models on Vertex AI?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/claude-on-vertex-ai#making-requests\",\"https://docs.claude.com/en/api/claude-on-vertex-ai#accessing-vertex-ai\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #88"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What new capabilities and features were introduced by Anthropic on May 10th, 2024 and how do they enable users to create and tailor prompts for specific tasks?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#may-10th-2024\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #89"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "On what date did both the Claude 3.5 Sonnet model and the Artifacts feature in Claude.ai become available?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/release-notes/api#june-20th-2024\",\"https://docs.claude.com/en/release-notes/claude-apps#june-20th-2024\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #90"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "When putting words in Claude's mouth to shape the response, what header and value can you use in the request to limit Claude's response to a single token?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#basic-request-and-response\",\"https://docs.claude.com/en/api/messages-examples#putting-words-in-claudes-mouth\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #91"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What does the temperature parameter do when working with large language models?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#temperature\",\"https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/reduce-latency#2-optimize-prompt-and-output-length\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #92"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are two ways to specify API parameters when calling the Claude API using Claude for Sheets?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#tips-for-effective-evaluation\",\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#how-to-prefill-claudes-response\",\"https://docs.claude.com/en/docs/build-with-claude/claude-for-sheets#enter-your-first-prompt\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #93"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How does prefilling the response with an opening curly brace ({ ) affect Claude's output when extracting structured data from text?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#example-1-controlling-output-formatting-and-skipping-the-preamble\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #94"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are some helpful resources provided by Anthropic to dive deeper into building with images using Claude?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/build-with-claude/vision#dive-deeper-into-vision\",\"https://docs.claude.com/en/docs/build-with-claude/vision#about-the-prompt-examples\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #95"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "How do you specify the API key when creating a new Anthropic client in the Python and TypeScript SDK examples?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/client-sdks#typescript\",\"https://docs.claude.com/en/api/client-sdks#python\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #96"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are two key benefits of using the Anthropic Evaluation tool when developing prompts for an AI classification application?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/about-claude/use-cases/classification#2-develop-your-test-cases\",\"https://docs.claude.com/en/docs/test-and-evaluate/eval-tool#understanding-results\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #97"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What are the key differences between a pretrained language model like Claude's underlying model, and the final version of Claude available through Anthropic's API?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/docs/resources/glossary#pretraining\",\"https://docs.claude.com/en/docs/resources/glossary#llm\",\"https://docs.claude.com/en/docs/resources/glossary#fine-tuning\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #98"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "What is the IPv6 address range used by Anthropic?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/ip-addresses#ipv6\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #99"
|
|
},
|
|
{
|
|
"vars": {
|
|
"query": "When using the Python SDK to create a message with Claude, what are two ways you can specify your API key?",
|
|
"correct_chunks": "[\"https://docs.claude.com/en/api/messages-examples#multiple-conversational-turns\",\"https://docs.claude.com/en/api/client-sdks#python\"]"
|
|
},
|
|
"assert": [
|
|
{
|
|
"type": "python",
|
|
"value": "file://eval_retrieval.py"
|
|
}
|
|
],
|
|
"options": {},
|
|
"description": "Row #100"
|
|
}
|
|
],
|
|
"scenarios": [],
|
|
"env": {},
|
|
"sharing": true,
|
|
"defaultTest": {
|
|
"vars": {},
|
|
"assert": [],
|
|
"options": {}
|
|
},
|
|
"outputPath": [
|
|
"../data/retrieval_results.json"
|
|
],
|
|
"metadata": {}
|
|
},
|
|
"shareableUrl": null
|
|
} |