import anthropic
import os
import json
from typing import Dict, Union, Any


def llm_eval(summary, input):
    """
    Evaluate summary using an LLM (Claude).

    Args:
        summary (str): The summary to evaluate.
        input (str): The original text that was summarized.

    Returns:
        tuple: The average numeric score across the criteria and the model's
            explanation string.
    """
    client = anthropic.Anthropic(api_key=os.getenv("CLAUDE_API_KEY"))

    # You could include an example here too and likely improve performance further!
prompt = f"""Evaluate the following summary based on these criteria:
|
|
1. Conciseness (1-5)
|
|
2. Accuracy (1-5)
|
|
3. Completeness (1-5)
|
|
4. Clarity (1-5)
|
|
5. Explanation - a general description of the way the summary is evaluatied
|
|
|
|
Here are some things to think about as you go about grading.
|
|
|
|
1. Does the summary accurately capture the key provisions of the legal document?
|
|
2. Does the summary omit any important details from the legal document?
|
|
3. Does the summary contain any inaccuracies or misrepresentations of the legal document?
|
|
4. Does the summary fairly represent the legal document as a whole, or does it unduly emphasize certain provisions over others?
|
|
5. Does the summary accurately reflect the language and tone of the legal document?
|
|
6. Does the summary capture the key concepts and principles embodied in the legal document?
|
|
7. Does the summary omit any important ideas that should be captured to make decisions using the document?
|
|
|
|
Provide a score for each criterion in JSON format. Here is the format you should always follow:

<json>
{{
    "conciseness": <number>,
    "accuracy": <number>,
    "completeness": <number>,
    "clarity": <number>,
    "explanation": <string>
}}
</json>

Original Text: {input}

Summary to Evaluate: {summary}

Evaluation (JSON format):"""

    # Prefill the assistant turn with "<json>" and stop at "</json>" so the
    # response body is just the raw JSON evaluation.
    response = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=1000,
        temperature=0,
        messages=[
            {
                "role": "user",
                "content": prompt
            },
            {
                "role": "assistant",
                "content": "<json>"
            }
        ],
        stop_sequences=["</json>"]
    )

    evaluation = json.loads(response.content[0].text)
    # Filter out non-numeric values and calculate the average
    numeric_values = [value for key, value in evaluation.items() if isinstance(value, (int, float))]
    avg_score = sum(numeric_values) / len(numeric_values)
    return avg_score, evaluation['explanation']


def get_assert(output: str, context, threshold=0.5) -> Union[bool, float, Dict[str, Any]]:
    input = context['vars']['input']
    score, evaluation = llm_eval(output, input)

    # 4 different dimensions we measure performance on
    normalized_score = score / 4

    if normalized_score >= threshold:
        return {
            "pass": True,
            "score": score,
            "reason": evaluation
        }
    else:
        return {
            "pass": False,
            "score": score,
            "reason": evaluation
        }
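

# A minimal, hypothetical usage sketch: it assumes a promptfoo-style context dict
# and a CLAUDE_API_KEY in the environment; the document text and summary below are
# illustrative placeholders, not real eval data.
if __name__ == "__main__":
    example_context = {
        "vars": {
            "input": "The tenant shall pay rent of $1,000 on the first of each month. "
                     "Late payments incur a 5% fee after a 3-day grace period."
        }
    }
    example_summary = "Rent is $1,000, due on the 1st, with a 5% late fee after a 3-day grace period."
    result = get_assert(example_summary, example_context)
    print(json.dumps(result, indent=2))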