diff --git a/.github/workflows/claude-link-review.yml b/.github/workflows/claude-link-review.yml
index 636232d..f84a929 100644
--- a/.github/workflows/claude-link-review.yml
+++ b/.github/workflows/claude-link-review.yml
@@ -25,7 +25,7 @@ jobs:
       - name: Run Claude Link Review
         uses: anthropics/claude-code-action@v1
         with:
-          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           prompt: "/link-review"
           claude_args: |
diff --git a/.github/workflows/claude-model-check.yml b/.github/workflows/claude-model-check.yml
index 6d96573..e23464b 100644
--- a/.github/workflows/claude-model-check.yml
+++ b/.github/workflows/claude-model-check.yml
@@ -24,7 +24,7 @@ jobs:
       - name: Claude Model Validation
         uses: anthropics/claude-code-action@v1
         with:
-          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           prompt: "/model-check"
           claude_args: |
diff --git a/.github/workflows/claude-notebook-review.yml b/.github/workflows/claude-notebook-review.yml
index 1efb35e..6a38477 100644
--- a/.github/workflows/claude-notebook-review.yml
+++ b/.github/workflows/claude-notebook-review.yml
@@ -25,7 +25,7 @@ jobs:
       - name: Run Claude Notebook Review
         uses: anthropics/claude-code-action@v1
         with:
-          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           prompt: "/notebook-review"
           claude_args: |
diff --git a/.github/workflows/notebook-quality.yml b/.github/workflows/notebook-quality.yml
index f1b6269..46ad66b 100644
--- a/.github/workflows/notebook-quality.yml
+++ b/.github/workflows/notebook-quality.yml
@@ -57,7 +57,7 @@ jobs:
         if: github.event_name == 'pull_request' && steps.validate.outputs.has_issues == 'true'
        uses: anthropics/claude-code-action@v1
         with:
-          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           prompt: |
             The notebook validation found these issues:
-name "*.ipynb" -not -path "*/.*" -not -path "*/test_outputs/*"); do diff --git a/skills/contextual-embeddings/guide.ipynb b/skills/contextual-embeddings/guide.ipynb index c9e674a..6c1c9e6 100644 --- a/skills/contextual-embeddings/guide.ipynb +++ b/skills/contextual-embeddings/guide.ipynb @@ -532,202 +532,10 @@ }, { "cell_type": "code", - "execution_count": 318, + "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "import os\n", - "import pickle\n", - "import json\n", - "import numpy as np\n", - "import voyageai\n", - "from typing import List, Dict, Any\n", - "from tqdm import tqdm\n", - "import anthropic\n", - "import threading\n", - "import time\n", - "from concurrent.futures import ThreadPoolExecutor, as_completed\n", - "\n", - "class ContextualVectorDB:\n", - " def __init__(self, name: str, voyage_api_key=None, ANTHROPIC_API_KEY=None):\n", - " if voyage_api_key is None:\n", - " voyage_api_key = os.getenv(\"VOYAGE_API_KEY\")\n", - " if ANTHROPIC_API_KEY is None:\n", - " ANTHROPIC_API_KEY = os.getenv(\"ANTHROPIC_API_KEY\")\n", - " \n", - " self.voyage_client = voyageai.Client(api_key=voyage_api_key)\n", - " self.anthropic_client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)\n", - " self.name = name\n", - " self.embeddings = []\n", - " self.metadata = []\n", - " self.query_cache = {}\n", - " self.db_path = f\"./data/{name}/contextual_vector_db.pkl\"\n", - "\n", - " self.token_counts = {\n", - " 'input': 0,\n", - " 'output': 0,\n", - " 'cache_read': 0,\n", - " 'cache_creation': 0\n", - " }\n", - " self.token_lock = threading.Lock()\n", - "\n", - " def situate_context(self, doc: str, chunk: str) -> tuple[str, Any]:\n", - " DOCUMENT_CONTEXT_PROMPT = \"\"\"\n", - " \n", - " {doc_content}\n", - " \n", - " \"\"\"\n", - "\n", - " CHUNK_CONTEXT_PROMPT = \"\"\"\n", - " Here is the chunk we want to situate within the whole document\n", - " \n", - " {chunk_content}\n", - " \n", - "\n", - " Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk.\n", - " Answer only with the succinct context and nothing else.\n", - " \"\"\"\n", - "\n", - " response = self.anthropic_client.beta.prompt_caching.messages.create(\n", - " model=\"claude-3-haiku-20240307\",\n", - " max_tokens=1000,\n", - " temperature=0.0,\n", - " messages=[\n", - " {\n", - " \"role\": \"user\", \n", - " \"content\": [\n", - " {\n", - " \"type\": \"text\",\n", - " \"text\": DOCUMENT_CONTEXT_PROMPT.format(doc_content=doc),\n", - " \"cache_control\": {\"type\": \"ephemeral\"} #we will make use of prompt caching for the full documents\n", - " },\n", - " {\n", - " \"type\": \"text\",\n", - " \"text\": CHUNK_CONTEXT_PROMPT.format(chunk_content=chunk),\n", - " },\n", - " ]\n", - " },\n", - " ],\n", - " extra_headers={\"anthropic-beta\": \"prompt-caching-2024-07-31\"}\n", - " )\n", - " return response.content[0].text, response.usage\n", - "\n", - " def load_data(self, dataset: List[Dict[str, Any]], parallel_threads: int = 1):\n", - " if self.embeddings and self.metadata:\n", - " print(\"Vector database is already loaded. 
-    "            print(\"Vector database is already loaded. Skipping data loading.\")\n",
-    "            return\n",
-    "        if os.path.exists(self.db_path):\n",
-    "            print(\"Loading vector database from disk.\")\n",
-    "            self.load_db()\n",
-    "            return\n",
-    "\n",
-    "        texts_to_embed = []\n",
-    "        metadata = []\n",
-    "        total_chunks = sum(len(doc['chunks']) for doc in dataset)\n",
-    "\n",
-    "        def process_chunk(doc, chunk):\n",
-    "            #for each chunk, produce the context\n",
-    "            contextualized_text, usage = self.situate_context(doc['content'], chunk['content'])\n",
-    "            with self.token_lock:\n",
-    "                self.token_counts['input'] += usage.input_tokens\n",
-    "                self.token_counts['output'] += usage.output_tokens\n",
-    "                self.token_counts['cache_read'] += usage.cache_read_input_tokens\n",
-    "                self.token_counts['cache_creation'] += usage.cache_creation_input_tokens\n",
-    "            \n",
-    "            return {\n",
-    "                #append the context to the original text chunk\n",
-    "                'text_to_embed': f\"{chunk['content']}\\n\\n{contextualized_text}\",\n",
-    "                'metadata': {\n",
-    "                    'doc_id': doc['doc_id'],\n",
-    "                    'original_uuid': doc['original_uuid'],\n",
-    "                    'chunk_id': chunk['chunk_id'],\n",
-    "                    'original_index': chunk['original_index'],\n",
-    "                    'original_content': chunk['content'],\n",
-    "                    'contextualized_content': contextualized_text\n",
-    "                }\n",
-    "            }\n",
-    "\n",
-    "        print(f\"Processing {total_chunks} chunks with {parallel_threads} threads\")\n",
-    "        with ThreadPoolExecutor(max_workers=parallel_threads) as executor:\n",
-    "            futures = []\n",
-    "            for doc in dataset:\n",
-    "                for chunk in doc['chunks']:\n",
-    "                    futures.append(executor.submit(process_chunk, doc, chunk))\n",
-    "            \n",
-    "            for future in tqdm(as_completed(futures), total=total_chunks, desc=\"Processing chunks\"):\n",
-    "                result = future.result()\n",
-    "                texts_to_embed.append(result['text_to_embed'])\n",
-    "                metadata.append(result['metadata'])\n",
-    "\n",
-    "        self._embed_and_store(texts_to_embed, metadata)\n",
-    "        self.save_db()\n",
-    "\n",
-    "        #logging token usage\n",
-    "        print(f\"Contextual Vector database loaded and saved. Total chunks processed: {len(texts_to_embed)}\")\n",
-    "        print(f\"Total input tokens without caching: {self.token_counts['input']}\")\n",
-    "        print(f\"Total output tokens: {self.token_counts['output']}\")\n",
-    "        print(f\"Total input tokens written to cache: {self.token_counts['cache_creation']}\")\n",
-    "        print(f\"Total input tokens read from cache: {self.token_counts['cache_read']}\")\n",
-    "        \n",
-    "        total_tokens = self.token_counts['input'] + self.token_counts['cache_read'] + self.token_counts['cache_creation']\n",
-    "        savings_percentage = (self.token_counts['cache_read'] / total_tokens) * 100 if total_tokens > 0 else 0\n",
-    "        print(f\"Total input token savings from prompt caching: {savings_percentage:.2f}% of all input tokens used were read from cache.\")\n",
-    "        print(\"Tokens read from cache come at a 90 percent discount!\")\n",
-    "\n",
-    "    #we use voyage AI here for embeddings. Read more here: https://docs.voyageai.com/docs/embeddings\n",
-    "    def _embed_and_store(self, texts: List[str], data: List[Dict[str, Any]]):\n",
-    "        batch_size = 128\n",
-    "        result = [\n",
-    "            self.voyage_client.embed(\n",
-    "                texts[i : i + batch_size],\n",
-    "                model=\"voyage-2\"\n",
-    "            ).embeddings\n",
-    "            for i in range(0, len(texts), batch_size)\n",
-    "        ]\n",
-    "        self.embeddings = [embedding for batch in result for embedding in batch]\n",
-    "        self.metadata = data\n",
-    "\n",
-    "    def search(self, query: str, k: int = 20) -> List[Dict[str, Any]]:\n",
-    "        if query in self.query_cache:\n",
-    "            query_embedding = self.query_cache[query]\n",
-    "        else:\n",
-    "            query_embedding = self.voyage_client.embed([query], model=\"voyage-2\").embeddings[0]\n",
-    "            self.query_cache[query] = query_embedding\n",
-    "\n",
-    "        if not self.embeddings:\n",
-    "            raise ValueError(\"No data loaded in the vector database.\")\n",
-    "\n",
-    "        similarities = np.dot(self.embeddings, query_embedding)\n",
-    "        top_indices = np.argsort(similarities)[::-1][:k]\n",
-    "        \n",
-    "        top_results = []\n",
-    "        for idx in top_indices:\n",
-    "            result = {\n",
-    "                \"metadata\": self.metadata[idx],\n",
-    "                \"similarity\": float(similarities[idx]),\n",
-    "            }\n",
-    "            top_results.append(result)\n",
-    "        return top_results\n",
-    "\n",
-    "    def save_db(self):\n",
-    "        data = {\n",
-    "            \"embeddings\": self.embeddings,\n",
-    "            \"metadata\": self.metadata,\n",
-    "            \"query_cache\": json.dumps(self.query_cache),\n",
-    "        }\n",
-    "        os.makedirs(os.path.dirname(self.db_path), exist_ok=True)\n",
-    "        with open(self.db_path, \"wb\") as file:\n",
-    "            pickle.dump(data, file)\n",
-    "\n",
-    "    def load_db(self):\n",
-    "        if not os.path.exists(self.db_path):\n",
-    "            raise ValueError(\"Vector database file not found. Use load_data to create a new database.\")\n",
-    "        with open(self.db_path, \"rb\") as file:\n",
-    "            data = pickle.load(file)\n",
-    "        self.embeddings = data[\"embeddings\"]\n",
-    "        self.metadata = data[\"metadata\"]\n",
-    "        self.query_cache = json.loads(data[\"query_cache\"])"
-   ]
+   "source": "import os\nimport pickle\nimport json\nimport numpy as np\nimport voyageai\nfrom typing import List, Dict, Any\nfrom tqdm import tqdm\nimport anthropic\nimport threading\nimport time\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\n\nclass ContextualVectorDB:\n    def __init__(self, name: str, voyage_api_key=None, anthropic_api_key=None):\n        if voyage_api_key is None:\n            voyage_api_key = os.getenv(\"VOYAGE_API_KEY\")\n        if anthropic_api_key is None:\n            anthropic_api_key = os.getenv(\"ANTHROPIC_API_KEY\")\n        \n        self.voyage_client = voyageai.Client(api_key=voyage_api_key)\n        self.anthropic_client = anthropic.Anthropic(api_key=anthropic_api_key)\n        self.name = name\n        self.embeddings = []\n        self.metadata = []\n        self.query_cache = {}\n        self.db_path = f\"./data/{name}/contextual_vector_db.pkl\"\n\n        self.token_counts = {\n            'input': 0,\n            'output': 0,\n            'cache_read': 0,\n            'cache_creation': 0\n        }\n        self.token_lock = threading.Lock()\n\n    def situate_context(self, doc: str, chunk: str) -> tuple[str, Any]:\n        DOCUMENT_CONTEXT_PROMPT = \"\"\"\n        <document>\n        {doc_content}\n        </document>\n        \"\"\"\n\n        CHUNK_CONTEXT_PROMPT = \"\"\"\n        Here is the chunk we want to situate within the whole document\n        <chunk>\n        {chunk_content}\n        </chunk>\n\n        Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk.\n        Answer only with the succinct context and nothing else.\n        \"\"\"\n\n        response = self.anthropic_client.beta.prompt_caching.messages.create(\n            model=\"claude-3-haiku-20240307\",\n            max_tokens=1000,\n            temperature=0.0,\n            messages=[\n                {\n                    \"role\": \"user\", \n                    \"content\": [\n                        {\n                            \"type\": \"text\",\n                            \"text\": DOCUMENT_CONTEXT_PROMPT.format(doc_content=doc),\n                            \"cache_control\": {\"type\": \"ephemeral\"} #we will make use of prompt caching for the full documents\n                        },\n                        {\n                            \"type\": \"text\",\n                            \"text\": CHUNK_CONTEXT_PROMPT.format(chunk_content=chunk),\n                        },\n                    ]\n                },\n            ],\n            extra_headers={\"anthropic-beta\": \"prompt-caching-2024-07-31\"}\n        )\n        return response.content[0].text, response.usage\n\n    def load_data(self, dataset: List[Dict[str, Any]], parallel_threads: int = 1):\n        if self.embeddings and self.metadata:\n            print(\"Vector database is already loaded. Skipping data loading.\")\n            return\n        if os.path.exists(self.db_path):\n            print(\"Loading vector database from disk.\")\n            self.load_db()\n            return\n\n        texts_to_embed = []\n        metadata = []\n        total_chunks = sum(len(doc['chunks']) for doc in dataset)\n\n        def process_chunk(doc, chunk):\n            #for each chunk, produce the context\n            contextualized_text, usage = self.situate_context(doc['content'], chunk['content'])\n            with self.token_lock:\n                self.token_counts['input'] += usage.input_tokens\n                self.token_counts['output'] += usage.output_tokens\n                self.token_counts['cache_read'] += usage.cache_read_input_tokens\n                self.token_counts['cache_creation'] += usage.cache_creation_input_tokens\n            \n            return {\n                #append the context to the original text chunk\n                'text_to_embed': f\"{chunk['content']}\\n\\n{contextualized_text}\",\n                'metadata': {\n                    'doc_id': doc['doc_id'],\n                    'original_uuid': doc['original_uuid'],\n                    'chunk_id': chunk['chunk_id'],\n                    'original_index': chunk['original_index'],\n                    'original_content': chunk['content'],\n                    'contextualized_content': contextualized_text\n                }\n            }\n\n        print(f\"Processing {total_chunks} chunks with {parallel_threads} threads\")\n        with ThreadPoolExecutor(max_workers=parallel_threads) as executor:\n            futures = []\n            for doc in dataset:\n                for chunk in doc['chunks']:\n                    futures.append(executor.submit(process_chunk, doc, chunk))\n            \n            for future in tqdm(as_completed(futures), total=total_chunks, desc=\"Processing chunks\"):\n                result = future.result()\n                texts_to_embed.append(result['text_to_embed'])\n                metadata.append(result['metadata'])\n\n        self._embed_and_store(texts_to_embed, metadata)\n        self.save_db()\n\n        #logging token usage\n        print(f\"Contextual Vector database loaded and saved. Total chunks processed: {len(texts_to_embed)}\")\n        print(f\"Total input tokens without caching: {self.token_counts['input']}\")\n        print(f\"Total output tokens: {self.token_counts['output']}\")\n        print(f\"Total input tokens written to cache: {self.token_counts['cache_creation']}\")\n        print(f\"Total input tokens read from cache: {self.token_counts['cache_read']}\")\n        \n        total_tokens = self.token_counts['input'] + self.token_counts['cache_read'] + self.token_counts['cache_creation']\n        savings_percentage = (self.token_counts['cache_read'] / total_tokens) * 100 if total_tokens > 0 else 0\n        print(f\"Total input token savings from prompt caching: {savings_percentage:.2f}% of all input tokens used were read from cache.\")\n        print(\"Tokens read from cache come at a 90 percent discount!\")\n\n    #we use voyage AI here for embeddings. Read more here: https://docs.voyageai.com/docs/embeddings\n    def _embed_and_store(self, texts: List[str], data: List[Dict[str, Any]]):\n        batch_size = 128\n        result = [\n            self.voyage_client.embed(\n                texts[i : i + batch_size],\n                model=\"voyage-2\"\n            ).embeddings\n            for i in range(0, len(texts), batch_size)\n        ]\n        self.embeddings = [embedding for batch in result for embedding in batch]\n        self.metadata = data\n\n    def search(self, query: str, k: int = 20) -> List[Dict[str, Any]]:\n        if query in self.query_cache:\n            query_embedding = self.query_cache[query]\n        else:\n            query_embedding = self.voyage_client.embed([query], model=\"voyage-2\").embeddings[0]\n            self.query_cache[query] = query_embedding\n\n        if not self.embeddings:\n            raise ValueError(\"No data loaded in the vector database.\")\n\n        similarities = np.dot(self.embeddings, query_embedding)\n        top_indices = np.argsort(similarities)[::-1][:k]\n        \n        top_results = []\n        for idx in top_indices:\n            result = {\n                \"metadata\": self.metadata[idx],\n                \"similarity\": float(similarities[idx]),\n            }\n            top_results.append(result)\n        return top_results\n\n    def save_db(self):\n        data = {\n            \"embeddings\": self.embeddings,\n            \"metadata\": self.metadata,\n            \"query_cache\": json.dumps(self.query_cache),\n        }\n        os.makedirs(os.path.dirname(self.db_path), exist_ok=True)\n        with open(self.db_path, \"wb\") as file:\n            pickle.dump(data, file)\n\n    def load_db(self):\n        if not os.path.exists(self.db_path):\n            raise ValueError(\"Vector database file not found. Use load_data to create a new database.\")\n        with open(self.db_path, \"rb\") as file:\n            data = pickle.load(file)\n        self.embeddings = data[\"embeddings\"]\n        self.metadata = data[\"metadata\"]\n        self.query_cache = json.loads(data[\"query_cache\"])"
   },
   {
    "cell_type": "code",
@@ -1384,4 +1192,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
\ No newline at end of file
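
For reviewers who want to exercise the renamed `anthropic_api_key` parameter end to end, a minimal smoke-test sketch follows. It assumes the notebook's `ContextualVectorDB` cell has already been run, that `VOYAGE_API_KEY` and `ANTHROPIC_API_KEY` are set in the environment, and that a chunked dataset exists on disk; the dataset path and query string are illustrative assumptions, not part of this diff.

```python
# Hypothetical smoke test -- run inside the notebook after the
# ContextualVectorDB cell has executed. The dataset path below is an
# assumption for illustration, not something this diff introduces.
import json

with open("data/codebase_chunks.json", "r") as f:
    dataset = json.load(f)

# Omitting both key arguments deliberately exercises the os.getenv
# fallbacks behind the renamed voyage_api_key/anthropic_api_key parameters.
db = ContextualVectorDB(name="smoke_test")
db.load_data(dataset, parallel_threads=4)

# search() returns dicts with "metadata" and "similarity" keys.
for result in db.search("How are query embeddings cached?", k=3):
    print(round(result["similarity"], 3), result["metadata"]["chunk_id"])
```

Leaving both key arguments unset is deliberate: it confirms the environment-variable fallbacks still resolve after the rename.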