Add PDF highlighting to using_citations cookbook (#133)

This commit is contained in:
cal-anthropic
2025-01-28 09:44:46 -08:00
committed by GitHub
parent e268dbfa41
commit 09c00ae469
3 changed files with 227 additions and 155 deletions

Binary file not shown.

View File

@@ -40,7 +40,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
@@ -84,7 +84,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 57,
"metadata": {},
"outputs": [
{
@@ -96,28 +96,29 @@
"Raw response:\n",
"================================================================================\n",
"{\n",
" \"blocks\": [\n",
" \"content\": [\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": \"Based on the documentation, I can explain why you don't see tracking yet: \"\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": \"You'll receive an email with your tracking number once your order ships. If you don't receive a tracking number within 48 hours of your order confirmation, please contact our customer support team for assistance.\",\n",
" \"citations\": [\n",
" {\n",
" \"type\": \"char_location\",\n",
" \"cited_text\": \"Once your order ships, you'll receive an email with a tracking number. \",\n",
" \"start_char_index\": 0,\n",
" \"end_char_index\": 71,\n",
" \"document_title\": \"Order Tracking Information\"\n",
" },\n",
" {\n",
" \"type\": \"char_location\",\n",
" \"cited_text\": \"If you haven't received a tracking number within 48 hours of your order confirmation, please contact our customer support team.\",\n",
" \"start_char_index\": 398,\n",
" \"end_char_index\": 525,\n",
" \"document_title\": \"Order Tracking Information\"\n",
" }\n",
" ]\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": \"\\n\\nSince you just checked out, your order likely hasn't shipped yet. Once it ships, you'll receive the tracking information via email.\"\n",
" }\n",
" ]\n",
@@ -171,30 +172,36 @@
" ]\n",
")\n",
"\n",
"def visualize_raw_response(response):\n",
" raw_response = {\"content\": []}\n",
"\n",
" print(\"\\n\" + \"=\"*80 + \"\\nRaw response:\\n\" + \"=\"*80)\n",
"raw_response = {\n",
" \"blocks\": []\n",
"}\n",
" \n",
" for content in response.content:\n",
" if content.type == \"text\":\n",
" block = {\n",
" \"text\": content.text,\n",
" \"type\": \"text\",\n",
" \"text\": content.text\n",
" }\n",
" if hasattr(content, 'citations') and content.citations:\n",
" block[\"citations\"] = [\n",
" {\n",
" \"type\": c.type,\n",
" \"cited_text\": c.cited_text,\n",
" \"document_index\": c.document_index,\n",
" \"document_title\": c.document_title,\n",
" \"start_char_index\": c.start_char_index,\n",
" \"end_char_index\": c.end_char_index\n",
" } for c in content.citations\n",
" ]\n",
" raw_response[\"blocks\"].append(block)\n",
" block[\"citations\"] = []\n",
" for citation in content.citations:\n",
" citation_dict = {\n",
" \"type\": citation.type,\n",
" \"cited_text\": citation.cited_text,\n",
" \"document_title\": citation.document_title,\n",
" }\n",
" if citation.type == \"page_location\":\n",
" citation_dict.update({\n",
" \"start_page_number\": citation.start_page_number,\n",
" \"end_page_number\": citation.end_page_number\n",
" })\n",
" block[\"citations\"].append(citation_dict)\n",
" raw_response[\"content\"].append(block)\n",
" \n",
"print(json.dumps(raw_response, indent=2))"
" return json.dumps(raw_response, indent=2)\n",
"\n",
"print(visualize_raw_response(response))"
]
},
{
@@ -218,13 +225,17 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 58,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"================================================================================\n",
"Formatted response:\n",
"================================================================================\n",
"Based on the documentation, I can explain why you don't see tracking yet: You'll receive an email with your tracking number once your order ships. If you don't receive a tracking number within 48 hours of your order confirmation, please contact our customer support team for assistance. [1] [2]\n",
"\n",
"Since you just checked out, your order likely hasn't shipped yet. Once it ships, you'll receive the tracking information via email.\n",
@@ -249,13 +260,23 @@
" formatted_text = \"\"\n",
" citations_list = []\n",
"\n",
" print(\"\\n\" + \"=\"*80 + \"\\nFormatted response:\\n\" + \"=\"*80)\n",
" \n",
" for content in response.content:\n",
" if content.type == \"text\":\n",
" text = content.text\n",
" if hasattr(content, 'citations') and content.citations:\n",
" # Sort citations by their appearance in the text\n",
" sorted_citations = sorted(content.citations, \n",
" key=lambda x: x.start_char_index)\n",
" def get_sort_key(citation):\n",
" if hasattr(citation, 'start_char_index'):\n",
" return citation.start_char_index\n",
" elif hasattr(citation, 'start_page_number'):\n",
" return citation.start_page_number\n",
" elif hasattr(citation, 'start_block_index'):\n",
" return citation.start_block_index\n",
" return 0 # fallback\n",
"\n",
" sorted_citations = sorted(content.citations, key=get_sort_key)\n",
" \n",
" # Process each citation\n",
" for citation in sorted_citations:\n",
@@ -306,7 +327,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 59,
"metadata": {},
"outputs": [
{
@@ -321,16 +342,15 @@
" \"content\": [\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": \"Based on the paper, here are the key aspects of Constitutional AI (CAI):\\n\\n\"\n",
" \"text\": \"Based on the paper, here are the key aspects of Constitutional AI:\\n\\n\"\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": \"Constitutional AI is a method for training a harmless AI assistant through self-improvement, without any human labels identifying harmful outputs. The only human oversight is provided through a list of rules or principles, which is why it's called 'Constitutional' AI.\",\n",
" \"text\": \"Constitutional AI is a method for training a harmless AI assistant through self-improvement, without any human labels identifying harmful outputs. The only human oversight is provided through a list of rules or principles, hence the name \\\"Constitutional AI\\\".\",\n",
" \"citations\": [\n",
" {\n",
" \"type\": \"page_location\",\n",
" \"cited_text\": \"We experiment with methods for training a harmless AI assistant through self\\u0002improvement, without any human labels identifying harmful outputs. The only human\\r\\noversight is provided through a list of rules or principles, and so we refer to the method as\\r\\n\\u2018Constitutional AI\\u2019. \",\n",
" \"document_index\": 0,\n",
" \"document_title\": \"Constitutional AI Paper\",\n",
" \"start_page_number\": 1,\n",
" \"end_page_number\": 2\n",
@@ -339,7 +359,7 @@
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": \"\\n\\nThe process involves two main stages:\\n\\n1. Supervised Learning Phase:\\n\"\n",
" \"text\": \"\\n\\nThe process involves two main phases:\\n\\n1. Supervised Learning Phase:\\n\"\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
@@ -348,7 +368,6 @@
" {\n",
" \"type\": \"page_location\",\n",
" \"cited_text\": \"In the supervised phase we sample from an initial model, then generate\\r\\nself-critiques and revisions, and then finetune the original model on revised responses. \",\n",
" \"document_index\": 0,\n",
" \"document_title\": \"Constitutional AI Paper\",\n",
" \"start_page_number\": 1,\n",
" \"end_page_number\": 2\n",
@@ -366,7 +385,6 @@
" {\n",
" \"type\": \"page_location\",\n",
" \"cited_text\": \"In\\r\\nthe RL phase, we sample from the finetuned model, use a model to evaluate which of the\\r\\ntwo samples is better, and then train a preference model from this dataset of AI prefer\\u0002ences. We then train with RL using the preference model as the reward signal, i.e. we\\r\\nuse \\u2018RL from AI Feedback\\u2019 (RLAIF). \",\n",
" \"document_index\": 0,\n",
" \"document_title\": \"Constitutional AI Paper\",\n",
" \"start_page_number\": 1,\n",
" \"end_page_number\": 2\n",
@@ -384,7 +402,6 @@
" {\n",
" \"type\": \"page_location\",\n",
" \"cited_text\": \"As a result we are able to train a harmless but non\\u0002evasive AI assistant that engages with harmful queries by explaining its objections to them.\\r\\nBoth the SL and RL methods can leverage chain-of-thought style reasoning to improve the\\r\\nhuman-judged performance and transparency of AI decision making. These methods make\\r\\nit possible to control AI behavior more precisely and with far fewer human labels.\\r\\n\",\n",
" \"document_index\": 0,\n",
" \"document_title\": \"Constitutional AI Paper\",\n",
" \"start_page_number\": 1,\n",
" \"end_page_number\": 2\n",
@@ -397,12 +414,11 @@
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": \"The ultimate goal is not to completely remove human supervision, but rather to make it more efficient, transparent and targeted. While this work reduces reliance on human supervision for harmlessness, they still relied on human supervision in the form of helpfulness labels. They expect it is possible to achieve helpfulness and instruction-following without human feedback, starting from only a pretrained LM and extensive prompting, but leave this for future work.\",\n",
" \"text\": \"The ultimate goal is not to completely remove human supervision, but rather to make it more efficient, transparent and targeted. While this work reduces reliance on human supervision for harmlessness, they still relied on human supervision in the form of helpfulness labels. The researchers expect it is possible to achieve helpfulness and instruction-following without human feedback, starting from only a pretrained LM and extensive prompting, but leave this for future work.\",\n",
" \"citations\": [\n",
" {\n",
" \"type\": \"page_location\",\n",
" \"cited_text\": \"By removing human feedback labels for harmlessness, we have moved further away from reliance on human\\r\\nsupervision, and closer to the possibility of a self-supervised approach to alignment. However, in this work\\r\\nwe still relied on human supervision in the form of helpfulness labels. We expect it is possible to achieve help\\u0002fulness and instruction-following without human feedback, starting from only a pretrained LM and extensive\\r\\nprompting, but we leave this for future work.\\r\\nOur ultimate goal is not to remove human supervision entirely, but to make it more efficient, transparent, and\\r\\ntargeted. \",\n",
" \"document_index\": 0,\n",
" \"document_title\": \"Constitutional AI Paper\",\n",
" \"start_page_number\": 15,\n",
" \"end_page_number\": 16\n",
@@ -415,11 +431,11 @@
"================================================================================\n",
"Formatted response:\n",
"================================================================================\n",
"Based on the paper, here are the key aspects of Constitutional AI (CAI):\n",
"Based on the paper, here are the key aspects of Constitutional AI:\n",
"\n",
"Constitutional AI is a method for training a harmless AI assistant through self-improvement, without any human labels identifying harmful outputs. The only human oversight is provided through a list of rules or principles, which is why it's called 'Constitutional' AI. [1]\n",
"Constitutional AI is a method for training a harmless AI assistant through self-improvement, without any human labels identifying harmful outputs. The only human oversight is provided through a list of rules or principles, hence the name \"Constitutional AI\". [1]\n",
"\n",
"The process involves two main stages:\n",
"The process involves two main phases:\n",
"\n",
"1. Supervised Learning Phase:\n",
"In this phase, they sample from an initial model, generate self-critiques and revisions, and then finetune the original model on revised responses. [2]\n",
@@ -437,7 +453,7 @@
"- Both the SL and RL methods can leverage chain-of-thought style reasoning to improve human-judged performance and transparency of AI decision making\n",
"- These methods make it possible to control AI behavior more precisely and with far fewer human labels [4]\n",
"\n",
"The ultimate goal is not to completely remove human supervision, but rather to make it more efficient, transparent and targeted. While this work reduces reliance on human supervision for harmlessness, they still relied on human supervision in the form of helpfulness labels. They expect it is possible to achieve helpfulness and instruction-following without human feedback, starting from only a pretrained LM and extensive prompting, but leave this for future work. [5]\n",
"The ultimate goal is not to completely remove human supervision, but rather to make it more efficient, transparent and targeted. While this work reduces reliance on human supervision for harmlessness, they still relied on human supervision in the form of helpfulness labels. The researchers expect it is possible to achieve helpfulness and instruction-following without human feedback, starting from only a pretrained LM and extensive prompting, but leave this for future work. [5]\n",
"\n",
"[1] \"We experiment with methods for training a harmless AI assistant through self\u0002improvement, without any human labels identifying harmful outputs. The only human oversight is provided through a list of rules or principles, and so we refer to the method as Constitutional AI.\" found in \"Constitutional AI Paper\"\n",
"[2] \"In the supervised phase we sample from an initial model, then generate self-critiques and revisions, and then finetune the original model on revised responses.\" found in \"Constitutional AI Paper\"\n",
@@ -456,7 +472,7 @@
"with open(pdf_path, \"rb\") as f:\n",
" pdf_data = base64.b64encode(f.read()).decode()\n",
"\n",
"response = client.messages.create(\n",
"pdf_response = client.messages.create(\n",
" model=\"claude-3-5-sonnet-latest\",\n",
" temperature=0.0,\n",
" max_tokens=1024,\n",
@@ -483,34 +499,8 @@
" ]\n",
")\n",
"\n",
"print(\"\\n\" + \"=\"*80 + \"\\nRaw response:\\n\" + \"=\"*80)\n",
"\n",
"# Convert response to a dictionary format\n",
"raw_response = {\"content\": []}\n",
"\n",
"for content in response.content:\n",
" if content.type == \"text\":\n",
" block = {\n",
" \"type\": \"text\",\n",
" \"text\": content.text\n",
" }\n",
" if hasattr(content, 'citations') and content.citations:\n",
" block[\"citations\"] = [\n",
" {\n",
" \"type\": c.type,\n",
" \"cited_text\": c.cited_text,\n",
" \"document_index\": c.document_index,\n",
" \"document_title\": c.document_title,\n",
" \"start_char_index\": c.start_char_index,\n",
" \"end_char_index\": c.end_char_index\n",
" } for c in content.citations\n",
" ]\n",
" raw_response[\"content\"].append(block)\n",
"\n",
"print(json.dumps(raw_response, indent=2))\n",
"formatted_response = visualize_citations(response)\n",
"print(\"\\n\" + \"=\"*80 + \"\\nFormatted response:\\n\" + \"=\"*80)\n",
"print(formatted_response)"
"print(visualize_raw_response(pdf_response))\n",
"print(visualize_citations(pdf_response))"
]
},
{
@@ -530,7 +520,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 60,
"metadata": {},
"outputs": [
{
@@ -550,10 +540,7 @@
" {\n",
" \"type\": \"content_block_location\",\n",
" \"cited_text\": \"Once your order ships, you'll receive an email with a tracking number. To track your package, log in to your PetWorld account and go to \\\"Order History.\\\" Click on the order you want to track and select \\\"Track Package.\\\" This will show you the current status and estimated delivery date. You can also enter the tracking number directly on our shipping partner's website for more detailed information. If you haven't received a tracking number within 48 hours of your order confirmation, please contact our customer support team.\",\n",
" \"document_index\": 3,\n",
" \"document_title\": \"Order Tracking Information\",\n",
" \"start_block_index\": 0,\n",
" \"end_block_index\": 1\n",
" \"document_title\": \"Order Tracking Information\"\n",
" }\n",
" ]\n",
" }\n",
@@ -596,7 +583,7 @@
"\n",
"QUESTION = \"I just checked out, where is my order tracking number? Track package is not available on the website yet for my order.\"\n",
"\n",
"response = client.messages.create(\n",
"custom_content_response = client.messages.create(\n",
" model=\"claude-3-5-sonnet-latest\",\n",
" temperature=0.0,\n",
" max_tokens=1024,\n",
@@ -613,34 +600,8 @@
" ]\n",
")\n",
"\n",
"print(\"\\n\" + \"=\"*80 + \"\\nRaw response:\\n\" + \"=\"*80)\n",
"raw_response = {\n",
" \"content\": []\n",
"}\n",
"\n",
"for content in response.content:\n",
" if content.type == \"text\":\n",
" block = {\n",
" \"type\": \"text\",\n",
" \"text\": content.text\n",
" }\n",
" if hasattr(content, 'citations') and content.citations:\n",
" block[\"citations\"] = [\n",
" {\n",
" \"type\": c.type,\n",
" \"cited_text\": c.cited_text,\n",
" \"document_index\": c.document_index,\n",
" \"document_title\": c.document_title,\n",
" \"start_char_index\": c.start_char_index,\n",
" \"end_char_index\": c.end_char_index\n",
" } for c in content.citations\n",
" ]\n",
" raw_response[\"content\"].append(block)\n",
"\n",
"print(json.dumps(raw_response, indent=2))\n",
"formatted_response = visualize_citations(response)\n",
"print(\"\\n\" + \"=\"*80 + \"\\nFormatted response:\\n\" + \"=\"*80)\n",
"print(formatted_response)"
"print(visualize_raw_response(custom_content_response))\n",
"print(visualize_citations(custom_content_response))"
]
},
{
@@ -660,7 +621,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 61,
"metadata": {},
"outputs": [
{
@@ -675,19 +636,16 @@
" \"content\": [\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": \"Let me explain PetWorld's loyalty program:\\n\\n\"\n",
" \"text\": \"Let me explain PetWorld's loyalty program based on the provided information:\\n\\n\"\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": \"The program works by awarding 1 point for every dollar you spend at PetWorld. Once you collect 100 points, you'll receive a $5 reward that you can redeem on your next purchase.\",\n",
" \"text\": \"PetWorld's loyalty program is straightforward - you earn 1 point for every dollar you spend. These points can be redeemed once you reach 100 points, which will get you a $5 reward that you can use on your next purchase.\",\n",
" \"citations\": [\n",
" {\n",
" \"type\": \"char_location\",\n",
" \"cited_text\": \"PetWorld offers a loyalty program where customers earn 1 point for every dollar spent. Once you accumulate 100 points, you'll receive a $5 reward that can be used on your next purchase. \",\n",
" \"document_index\": 0,\n",
" \"document_title\": \"Loyalty Program Details\",\n",
" \"start_char_index\": 0,\n",
" \"end_char_index\": 186\n",
" \"document_title\": \"Loyalty Program Details\"\n",
" }\n",
" ]\n",
" },\n",
@@ -702,10 +660,7 @@
" {\n",
" \"type\": \"char_location\",\n",
" \"cited_text\": \"Points expire 12 months after they are earned. \",\n",
" \"document_index\": 0,\n",
" \"document_title\": \"Loyalty Program Details\",\n",
" \"start_char_index\": 186,\n",
" \"end_char_index\": 233\n",
" \"document_title\": \"Loyalty Program Details\"\n",
" }\n",
" ]\n",
" },\n",
@@ -715,21 +670,18 @@
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": \"You can easily monitor your point balance by either checking your account dashboard or contacting customer service.\",\n",
" \"text\": \"You can easily keep track of your points by either checking your account dashboard or contacting customer service.\",\n",
" \"citations\": [\n",
" {\n",
" \"type\": \"char_location\",\n",
" \"cited_text\": \"You can check your point balance in your account dashboard or by asking customer service.\",\n",
" \"document_index\": 0,\n",
" \"document_title\": \"Loyalty Program Details\",\n",
" \"start_char_index\": 233,\n",
" \"end_char_index\": 322\n",
" \"document_title\": \"Loyalty Program Details\"\n",
" }\n",
" ]\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": \"\\n\\nPlease note that this information comes from an article that hasn't been updated in 12 months, so some details may have changed. I recommend verifying the current terms with PetWorld directly.\"\n",
" \"text\": \"\\n\\nPlease note that since this information is from an article that hasn't been updated in 12 months, some details of the program may have changed. It would be best to verify the current terms with PetWorld directly.\"\n",
" }\n",
" ]\n",
"}\n",
@@ -737,15 +689,15 @@
"================================================================================\n",
"Formatted response:\n",
"================================================================================\n",
"Let me explain PetWorld's loyalty program:\n",
"Let me explain PetWorld's loyalty program based on the provided information:\n",
"\n",
"The program works by awarding 1 point for every dollar you spend at PetWorld. Once you collect 100 points, you'll receive a $5 reward that you can redeem on your next purchase. [1]\n",
"PetWorld's loyalty program is straightforward - you earn 1 point for every dollar you spend. These points can be redeemed once you reach 100 points, which will get you a $5 reward that you can use on your next purchase. [1]\n",
"\n",
"Points have an expiration period of 12 months from the date they are earned. [2]\n",
"\n",
"You can easily monitor your point balance by either checking your account dashboard or contacting customer service. [3]\n",
"You can easily keep track of your points by either checking your account dashboard or contacting customer service. [3]\n",
"\n",
"Please note that this information comes from an article that hasn't been updated in 12 months, so some details may have changed. I recommend verifying the current terms with PetWorld directly.\n",
"Please note that since this information is from an article that hasn't been updated in 12 months, some details of the program may have changed. It would be best to verify the current terms with PetWorld directly.\n",
"\n",
"[1] \"PetWorld offers a loyalty program where customers earn 1 point for every dollar spent. Once you accumulate 100 points, you'll receive a $5 reward that can be used on your next purchase.\" found in \"Loyalty Program Details\"\n",
"[2] \"Points expire 12 months after they are earned.\" found in \"Loyalty Program Details\"\n",
@@ -771,7 +723,7 @@
"\n",
"QUESTION = \"How does PetWorld's loyalty program work? When do points expire?\"\n",
"\n",
"response = client.messages.create(\n",
"context_response = client.messages.create(\n",
" model=\"claude-3-5-sonnet-latest\",\n",
" temperature=0.0,\n",
" max_tokens=1024,\n",
@@ -789,34 +741,154 @@
" ]\n",
")\n",
"\n",
"print(\"\\n\" + \"=\"*80 + \"\\nRaw response:\\n\" + \"=\"*80)\n",
"raw_response = {\n",
" \"content\": []\n",
"}\n",
"print(visualize_raw_response(context_response))\n",
"print(visualize_citations(context_response))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### PDF Highlighting\n",
"\n",
"for content in response.content:\n",
" if content.type == \"text\":\n",
" block = {\n",
" \"type\": \"text\",\n",
" \"text\": content.text\n",
" }\n",
" if hasattr(content, 'citations') and content.citations:\n",
" block[\"citations\"] = [\n",
"One limitation of PDF citations is that only page numbers are returned, not the location of the cited text within the page. You can use third-party libraries to match the returned cited text against the page contents and draw attention to the cited passage. This cell demonstrates PDF citation highlighting using Claude and PyMuPDF, creating a new annotated PDF:"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"================================================================================\n",
"Raw response:\n",
"================================================================================\n",
"{\n",
" \"type\": c.type,\n",
" \"cited_text\": c.cited_text,\n",
" \"document_index\": c.document_index,\n",
" \"document_title\": c.document_title,\n",
" \"start_char_index\": c.start_char_index,\n",
" \"end_char_index\": c.end_char_index\n",
" } for c in content.citations\n",
" \"content\": [\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": \"According to the letter, \"\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": \"Amazon's total revenue grew 12% year-over-year (\\\"YoY\\\") from $514B to $575B in 2023\",\n",
" \"citations\": [\n",
" {\n",
" \"type\": \"page_location\",\n",
" \"cited_text\": \"In 2023, Amazon\\u2019s total revenue grew 12% year-over-year (\\u201cYoY\\u201d) from $514B to $575B. \",\n",
" \"document_title\": \"Amazon 2023 Shareholder Letter\",\n",
" \"start_page_number\": 1,\n",
" \"end_page_number\": 2\n",
" }\n",
" ]\n",
" raw_response[\"content\"].append(block)\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": \".\\n\\nBreaking this down by segment:\\n\"\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": \"\\n- North America revenue increased 12% YoY from $316B to $353B\\n- International revenue grew 11% YoY from $118B to $131B \\n- AWS revenue increased 13% YoY from $80B to $91B\",\n",
" \"citations\": [\n",
" {\n",
" \"type\": \"page_location\",\n",
" \"cited_text\": \"By segment, North\\r\\nAmerica revenue increased 12% YoY from $316B to $353B, International revenue grew 11% YoY from\\r\\n$118B to $131B, and AWS revenue increased 13% YoY from $80B to $91B.\\r\\n\",\n",
" \"document_title\": \"Amazon 2023 Shareholder Letter\",\n",
" \"start_page_number\": 1,\n",
" \"end_page_number\": 2\n",
" }\n",
" ]\n",
" }\n",
" ]\n",
"}\n",
"Found cited text on page 1\n",
"Found cited text on page 1\n",
"\n",
"print(json.dumps(raw_response, indent=2))\n",
"formatted_response = visualize_citations(response)\n",
"print(\"\\n\" + \"=\"*80 + \"\\nFormatted response:\\n\" + \"=\"*80)\n",
"print(formatted_response)"
"Created highlighted PDF at: data/Amazon-com-Inc-2023-Shareholder-Letter-highlighted.pdf\n"
]
}
],
"source": [
"import fitz # PyMuPDF\n",
"\n",
"# Setup paths and read PDF\n",
"pdf_path = 'data/Amazon-com-Inc-2023-Shareholder-Letter.pdf'\n",
"output_pdf_path = 'data/Amazon-com-Inc-2023-Shareholder-Letter-highlighted.pdf'\n",
"\n",
"# Read and encode the PDF\n",
"with open(pdf_path, \"rb\") as f:\n",
" pdf_data = base64.b64encode(f.read()).decode()\n",
"\n",
"response = client.messages.create(\n",
" model=\"claude-3-5-sonnet-latest\",\n",
" max_tokens=1024,\n",
" temperature=0,\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\n",
" \"type\": \"document\",\n",
" \"source\": {\n",
" \"type\": \"base64\",\n",
" \"media_type\": \"application/pdf\",\n",
" \"data\": pdf_data\n",
" },\n",
" \"title\": \"Amazon 2023 Shareholder Letter\",\n",
" \"citations\": {\"enabled\": True}\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": \"What was Amazon's total revenue in 2023 and how much did it grow year-over-year?\"\n",
" }\n",
" ]\n",
" }\n",
" ]\n",
")\n",
"\n",
"print(visualize_raw_response(response))\n",
"\n",
"# Collect PDF citations\n",
"pdf_citations = []\n",
"for content in response.content:\n",
" if hasattr(content, 'citations') and content.citations:\n",
" for citation in content.citations:\n",
" if citation.type == \"page_location\":\n",
" pdf_citations.append(citation)\n",
"\n",
"doc = fitz.open(pdf_path)\n",
"\n",
"# Process each citation\n",
"for citation in pdf_citations:\n",
" if citation.type == \"page_location\":\n",
" text_to_find = citation.cited_text.replace('\\u0002', '')\n",
" start_page = citation.start_page_number - 1 # Convert to 0-based index\n",
" end_page = citation.end_page_number - 2\n",
" \n",
" # Process each page in the citation range\n",
" for page_num in range(start_page, end_page + 1):\n",
" page = doc[page_num]\n",
" \n",
" text_instances = page.search_for(text_to_find.strip())\n",
" \n",
" if text_instances:\n",
" print(f\"Found cited text on page {page_num + 1}\")\n",
" for inst in text_instances:\n",
" highlight = page.add_highlight_annot(inst)\n",
" highlight.set_colors({\"stroke\":(1, 1, 0)}) # Yellow highlight\n",
" highlight.update()\n",
" else:\n",
" print(f\"{text_to_find} not found on page {page_num + 1}\")\n",
"\n",
"# Save the new PDF\n",
"doc.save(output_pdf_path)\n",
"doc.close()\n",
"\n",
"print(f\"\\nCreated highlighted PDF at: {output_pdf_path}\")"
]
}
],