updated code

nird
2025-02-02 12:24:43 +02:00
parent f50f0e4373
commit 76a529eccf
5 changed files with 185 additions and 34 deletions

View File

@@ -1,5 +1,6 @@
FORM 10-K FORM 10-KUNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C.
Washington, D.C. 20549
FORM 10-K
(Mark One)

View File

@@ -14,12 +14,14 @@ Custom modules:
"""
import json
from typing import List, Tuple
from typing import List, Tuple, Dict, Any
from deepeval import evaluate
from deepeval.metrics import GEval, FaithfulnessMetric, ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
# 09/15/24 kimmeyh Added the directory where the helper functions are located to the path
# Add the parent directory to the path since we work with notebooks
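(The sys.path line itself falls just outside this hunk. A minimal sketch of the setup this comment describes, assuming the evaluation module sits one directory below the helper functions; the exact line in the file may differ:)

import os
import sys

# Make the parent directory (where the helper functions live) importable from a notebook
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))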
@@ -90,41 +92,75 @@ relevance_metric = ContextualRelevancyMetric(
    include_reason=True
)
def evaluate_rag(chunks_query_retriever, num_questions: int = 5) -> None:
def evaluate_rag(retriever, num_questions: int = 5) -> Dict[str, Any]:
    """
    Evaluate the RAG system using predefined metrics.
    Evaluates a RAG system using LLM-generated test questions and predefined metrics.
    Args:
        chunks_query_retriever: Function to retrieve context chunks for a given query.
        num_questions (int): Number of questions to evaluate (default: 5).
        retriever: The retriever component to evaluate
        num_questions: Number of test questions to generate
    Returns:
        Dict containing evaluation metrics
    """
llm = ChatOpenAI(temperature=0, model_name="gpt-4o", max_tokens=2000)
question_answer_from_context_chain = create_question_answer_from_context_chain(llm)
# Load questions and answers from JSON file
q_a_file_name = "../data/q_a.json"
with open(q_a_file_name, "r", encoding="utf-8") as json_file:
q_a = json.load(json_file)
questions = [qa["question"] for qa in q_a][:num_questions]
ground_truth_answers = [qa["answer"] for qa in q_a][:num_questions]
generated_answers = []
retrieved_documents = []
# Generate answers and retrieve documents for each question
for question in questions:
context = retrieve_context_per_question(question, chunks_query_retriever)
retrieved_documents.append(context)
context_string = " ".join(context)
result = answer_question_from_context(question, context_string, question_answer_from_context_chain)
generated_answers.append(result["answer"])
# Create test cases and evaluate
test_cases = create_deep_eval_test_cases(questions, ground_truth_answers, generated_answers, retrieved_documents)
evaluate(
test_cases=test_cases,
metrics=[correctness_metric, faithfulness_metric, relevance_metric]
    # Initialize LLM
    llm = ChatOpenAI(temperature=0, model_name="gpt-4-turbo-preview")
    # Create evaluation prompt
    eval_prompt = PromptTemplate.from_template("""
Evaluate the following retrieval results for the question.
Question: {question}
Retrieved Context: {context}
Rate on a scale of 1-5 (5 being best) for:
1. Relevance: How relevant is the retrieved information to the question?
2. Completeness: Does the context contain all necessary information?
3. Conciseness: Is the retrieved context focused and free of irrelevant information?
Provide ratings in JSON format:
""")
    # Create evaluation chain
    eval_chain = (
        eval_prompt
        | llm
        | StrOutputParser()
    )
    # Generate test questions
    question_gen_prompt = PromptTemplate.from_template(
        "Generate {num_questions} diverse test questions about climate change:"
    )
    question_chain = question_gen_prompt | llm | StrOutputParser()
    questions = question_chain.invoke({"num_questions": num_questions}).split("\n")
    # Evaluate each question
    results = []
    for question in questions:
        # Get retrieval results
        context = retriever.get_relevant_documents(question)
        context_text = "\n".join([doc.page_content for doc in context])
        # Evaluate results
        eval_result = eval_chain.invoke({
            "question": question,
            "context": context_text
        })
        results.append(eval_result)
    return {
        "questions": questions,
        "results": results,
        "average_scores": calculate_average_scores(results)
    }
def calculate_average_scores(results: List[Dict]) -> Dict[str, float]:
    """Calculate average scores across all evaluation results."""
    # Implementation depends on the exact format of your results
    pass
if __name__ == "__main__":
    # Add any necessary setup or configuration here
    pass
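(The commit leaves calculate_average_scores as a pass stub, the judge prompt never pins down which JSON keys to return, and the List[Dict] annotation does not match what is actually appended, since eval_chain returns raw strings via StrOutputParser. Below is a minimal sketch of how the stub could be filled in, assuming the judge replies with a JSON object keyed "relevance", "completeness" and "conciseness"; those key names are an assumption, not something this commit specifies.)

import json
import re
from typing import Dict, List

def calculate_average_scores(results: List[str]) -> Dict[str, float]:
    """Average the per-question 1-5 ratings returned by the judge LLM."""
    totals: Dict[str, float] = {}
    counts: Dict[str, int] = {}
    for raw in results:
        match = re.search(r"\{.*\}", raw, re.DOTALL)  # tolerate prose around the JSON
        if not match:
            continue
        try:
            scores = json.loads(match.group(0))
        except json.JSONDecodeError:
            continue
        for key, value in scores.items():
            if isinstance(value, (int, float)):  # assumed keys: relevance, completeness, conciseness
                totals[key] = totals.get(key, 0.0) + float(value)
                counts[key] = counts.get(key, 0) + 1
    return {key: totals[key] / counts[key] for key in totals}

(With a retriever in hand, the entry point would then be something like evaluate_rag(vector_store.as_retriever(), num_questions=5).)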

View File

@@ -17,7 +17,7 @@ from enum import Enum
def replace_t_with_space(list_of_documents):
    """
    Replaces all tab characters ('\t') with spaces in the page content of each document.
    Replaces all tab characters ('\t') with spaces in the page content of each document
    Args:
        list_of_documents: A list of document objects, each with a 'page_content' attribute.
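(This hunk only touches the docstring. For readers without the file open, a body matching that description would look roughly like the following sketch; it is not necessarily the file's exact implementation.)

def replace_t_with_space(list_of_documents):
    """Replaces all tab characters ('\t') with spaces in the page content of each document."""
    for doc in list_of_documents:
        doc.page_content = doc.page_content.replace("\t", " ")
    return list_of_documents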

View File

@@ -208,3 +208,55 @@ nbformat==5.10.4
xxhash==3.5.0
yarl==1.10.0
zipp==3.20.1
# Core LangChain packages
langchain>=0.1.0
langchain-core>=0.1.17
langchain-community>=0.0.13
langchain-openai>=0.0.5
langchain-anthropic>=0.0.9
langchain-groq>=0.0.1
langchain-cohere>=0.0.1
# Vector stores and embeddings
faiss-cpu>=1.7.4
chromadb>=0.4.22
# Document processing
PyMuPDF>=1.23.8 # for fitz
python-docx>=1.0.1
pypdf>=3.17.4
rank-bm25>=0.2.2
# Machine Learning and Data Science
numpy>=1.24.3
pandas>=2.0.3
scikit-learn>=1.3.0
# API Clients
openai>=1.12.0
anthropic>=0.8.1
cohere>=4.48
groq>=0.4.2
# Testing and Evaluation
pytest>=7.4.0
deepeval>=0.20.12
grouse>=0.3.0
# Development Tools
python-dotenv>=1.0.0
jupyter>=1.0.0
notebook>=7.0.6
ipykernel>=6.29.2
# Type Checking
pydantic>=2.6.1
typing-extensions>=4.9.0
# Async Support
aiohttp>=3.9.1
# asyncio ships with the Python 3 standard library; the PyPI backport is not needed
# Utilities
tqdm>=4.66.1

View File

@@ -1,10 +1,18 @@
import pytest
import os
import sys
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_text_splitters import CharacterTextSplitter
from dotenv import load_dotenv
# Add the main folder to sys.path
sys.path.append(os.path.abspath(os.path.dirname(__file__) + "/../"))
# Load environment variables
load_dotenv()
def pytest_addoption(parser):
    parser.addoption(
        "--exclude", action="store", help="Comma-separated list of notebook or script files' paths to exclude"
@@ -40,4 +48,58 @@ def script_paths(request):
        path_with_full_address = [folder + s for s in include_scripts]
        return path_with_full_address
    return path_with_full_address
@pytest.fixture(scope="session")
def llm():
    """Fixture for ChatOpenAI model."""
    return ChatOpenAI(
        temperature=0,
        model_name="gpt-4-turbo-preview",
        max_tokens=4000
    )
@pytest.fixture(scope="session")
def embeddings():
    """Fixture for OpenAI embeddings."""
    return OpenAIEmbeddings()
@pytest.fixture(scope="session")
def text_splitter():
    """Fixture for text splitter."""
    return CharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200
    )
@pytest.fixture(scope="session")
def sample_texts():
    """Fixture for sample test data."""
    return [
        "The Earth is the third planet from the Sun.",
        "Climate change is a significant global challenge.",
        "Renewable energy sources include solar and wind power."
    ]
@pytest.fixture(scope="session")
def vector_store(embeddings, sample_texts, text_splitter):
    """Fixture for vector store."""
    docs = text_splitter.create_documents(sample_texts)
    return FAISS.from_documents(docs, embeddings)
@pytest.fixture(scope="session")
def retriever(vector_store):
    """Fixture for retriever."""
    return vector_store.as_retriever(search_kwargs={"k": 2})
@pytest.fixture(scope="session")
def basic_prompt():
    """Fixture for basic prompt template."""
    return PromptTemplate.from_template("""
Answer the following question based on the context provided:
Context: {context}
Question: {question}
Answer:
""")