Add caching in Python

Still need it in JS
This commit is contained in:
Kyle Corbitt
2023-08-11 19:02:35 -07:00
parent 8ed47eb4dd
commit d7cff0f52e
5 changed files with 115 additions and 28 deletions
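For reference, a minimal sketch of how the new Python-side caching is meant to be used, based on the test added in this commit (the import path is assumed; the tests exercise the wrapped `openai` module together with `configure_openpipe`):

from openpipe import openai  # assumed import of the wrapped client

# Opt into caching per call via the `openpipe` kwarg, as exercised by test_caching below
completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "system", "content": "count to 10"}],
    openpipe={"cache": True},
)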

View File

@@ -99,6 +99,14 @@ function TableRow({
[loggedCall.tags],
);
const durationCell = (
<Td isNumeric>
{loggedCall.cacheHit
? "Cache hit"
: ((loggedCall.modelResponse?.durationMs ?? 0) / 1000).toFixed(2) + "s"}
</Td>
);
return (
<>
<Tr
@@ -120,7 +128,7 @@ function TableRow({
</Tooltip>
</Td>
<Td width="100%">{model}</Td>
<Td isNumeric>{((loggedCall.modelResponse?.durationMs ?? 0) / 1000).toFixed(2)}s</Td>
{durationCell}
<Td isNumeric>{loggedCall.modelResponse?.inputTokens}</Td>
<Td isNumeric>{loggedCall.modelResponse?.outputTokens}</Td>
<Td sx={{ color: isError ? "red.500" : "green.500", fontWeight: "semibold" }} isNumeric>

View File

@@ -70,15 +70,9 @@ export const externalApiRouter = createTRPCRouter({
const cacheKey = hashRequest(key.projectId, reqPayload as JsonValue);
const existingResponse = await prisma.loggedCallModelResponse.findFirst({
where: {
cacheKey,
},
include: {
originalLoggedCall: true,
},
orderBy: {
requestedAt: "desc",
},
where: { cacheKey },
include: { originalLoggedCall: true },
orderBy: { requestedAt: "desc" },
});
if (!existingResponse) return { respPayload: null };
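The lookup above keys on hashRequest(key.projectId, reqPayload), whose implementation is not part of this diff. As a rough illustration only (hypothetical, not the server's actual algorithm), a deterministic key could be a digest over the project ID plus a canonical JSON encoding of the request, so identical requests map to the same stored response:

import hashlib
import json


def hash_request(project_id: str, req_payload: dict) -> str:
    # Hypothetical stand-in for the server-side hashRequest: identical requests
    # within a project must hash to the same key for the findFirst lookup to hit.
    canonical = json.dumps(req_payload, sort_keys=True, separators=(",", ":"))
    return hashlib.sha256((project_id + canonical).encode("utf-8")).hexdigest()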

View File

@@ -1,23 +1,24 @@
import openai as original_openai
from openai.openai_object import OpenAIObject
import time
import inspect
from openpipe.merge_openai_chunks import merge_streamed_chunks
from .shared import report_async, report
from .shared import maybe_check_cache, maybe_check_cache_async, report_async, report
class ChatCompletionWrapper:
def __getattr__(self, name):
return getattr(original_openai.ChatCompletion, name)
def __setattr__(self, name, value):
return setattr(original_openai.ChatCompletion, name, value)
class WrappedChatCompletion(original_openai.ChatCompletion):
@classmethod
def create(cls, *args, **kwargs):
openpipe_options = kwargs.pop("openpipe", {})
cached_response = maybe_check_cache(
openpipe_options=openpipe_options, req_payload=kwargs
)
if cached_response:
return OpenAIObject.construct_from(cached_response, api_key=None)
requested_at = int(time.time() * 1000)
try:
@@ -86,6 +87,12 @@ class ChatCompletionWrapper:
async def acreate(cls, *args, **kwargs):
openpipe_options = kwargs.pop("openpipe", {})
cached_response = await maybe_check_cache_async(
openpipe_options=openpipe_options, req_payload=kwargs
)
if cached_response:
return OpenAIObject.construct_from(cached_response, api_key=None)
requested_at = int(time.time() * 1000)
try:
@@ -152,13 +159,10 @@ class ChatCompletionWrapper:
class OpenAIWrapper:
ChatCompletion = ChatCompletionWrapper()
ChatCompletion = WrappedChatCompletion()
def __getattr__(self, name):
return getattr(original_openai, name)
def __setattr__(self, name, value):
return setattr(original_openai, name, value)
def __dir__(self):
return dir(original_openai) + ["openpipe_base_url", "openpipe_api_key"]
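Because maybe_check_cache returns the stored response payload with openpipe.cache_status set to "HIT" (see shared.py below), and create rebuilds it via OpenAIObject.construct_from, a caller can tell cached results apart from fresh ones. A rough sketch, assuming the wrapped openai module from this package:

completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "system", "content": "count to 10"}],
    openpipe={"cache": True},
)
# The `openpipe` key is only present on responses served from the cache
if completion.get("openpipe", {}).get("cache_status") == "HIT":
    print("served from cache")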

View File

@@ -1,9 +1,13 @@
from openpipe.api_client.api.default import external_api_report
from openpipe.api_client.api.default import (
external_api_report,
external_api_check_cache,
)
from openpipe.api_client.client import AuthenticatedClient
from openpipe.api_client.models.external_api_report_json_body_tags import (
ExternalApiReportJsonBodyTags,
)
import toml
import time
version = toml.load("pyproject.toml")["tool"]["poetry"]["version"]
@@ -20,6 +24,71 @@ def _get_tags(openpipe_options):
return ExternalApiReportJsonBodyTags.from_dict(tags)
def _should_check_cache(openpipe_options):
if configured_client.token == "":
return False
return openpipe_options.get("cache", False)
def _process_cache_payload(
payload: external_api_check_cache.ExternalApiCheckCacheResponse200,
):
if not payload or not payload.resp_payload:
return None
payload.resp_payload["openpipe"] = {"cache_status": "HIT"}
return payload.resp_payload
def maybe_check_cache(
openpipe_options={},
req_payload={},
):
if not _should_check_cache(openpipe_options):
return None
try:
payload = external_api_check_cache.sync(
client=configured_client,
json_body=external_api_check_cache.ExternalApiCheckCacheJsonBody(
req_payload=req_payload,
requested_at=int(time.time() * 1000),
tags=_get_tags(openpipe_options),
),
)
return _process_cache_payload(payload)
except Exception as e:
# We don't want to break client apps if our API is down for some reason
print(f"Error reporting to OpenPipe: {e}")
print(e)
return None
async def maybe_check_cache_async(
openpipe_options={},
req_payload={},
):
if not _should_check_cache(openpipe_options):
return None
try:
payload = await external_api_check_cache.asyncio(
client=configured_client,
json_body=external_api_check_cache.ExternalApiCheckCacheJsonBody(
req_payload=req_payload,
requested_at=int(time.time() * 1000),
tags=_get_tags(openpipe_options),
),
)
return _process_cache_payload(payload)
except Exception as e:
# We don't want to break client apps if our API is down for some reason
print(f"Error reporting to OpenPipe: {e}")
print(e)
return None
def report(
openpipe_options={},
**kwargs,

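A quick restatement of what these helpers guarantee: the cache is only consulted when an API token is configured and the call opts in with {"cache": True}; a miss (or any API error) yields None, so the wrapper falls back to a normal OpenAI call; a hit returns the stored payload tagged as cached. A small sketch, assuming the helpers are importable from openpipe.shared:

from types import SimpleNamespace

from openpipe.shared import _process_cache_payload  # assumed import path

# A hit returns the stored payload tagged as served from the cache...
hit = SimpleNamespace(resp_payload={"choices": []})
print(_process_cache_payload(hit))   # {'choices': [], 'openpipe': {'cache_status': 'HIT'}}
# ...while a miss returns None, so create()/acreate() proceed with a real request.
print(_process_cache_payload(None))  # None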
View File

@@ -12,7 +12,6 @@ configure_openpipe(
)
@pytest.mark.skip
def test_sync():
completion = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
@@ -22,7 +21,6 @@ def test_sync():
print(completion.choices[0].message.content)
@pytest.mark.skip
def test_streaming():
completion = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
@@ -34,7 +32,6 @@ def test_streaming():
print(chunk)
@pytest.mark.skip
async def test_async():
acompletion = await openai.ChatCompletion.acreate(
model="gpt-3.5-turbo",
@@ -44,7 +41,6 @@ async def test_async():
print(acompletion.choices[0].message.content)
@pytest.mark.skip
async def test_async_streaming():
acompletion = await openai.ChatCompletion.acreate(
model="gpt-3.5-turbo",
@@ -67,10 +63,26 @@ def test_sync_with_tags():
print(completion.choices[0].message.content)
@pytest.mark.focus
def test_bad_call():
completion = openai.ChatCompletion.create(
model="gpt-3.5-turbo-blaster",
messages=[{"role": "system", "content": "count to 10"}],
stream=True,
)
@pytest.mark.focus
async def test_caching():
completion = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[{"role": "system", "content": "count to 10"}],
openpipe={"cache": True},
)
completion2 = await openai.ChatCompletion.acreate(
model="gpt-3.5-turbo",
messages=[{"role": "system", "content": "count to 10"}],
openpipe={"cache": True},
)
print(completion2)
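The test above only prints the second completion. To assert that the repeated request was actually served from the cache, a follow-up check along these lines would work (hypothetical assertion, not part of this commit), since cached payloads are tagged by _process_cache_payload:

# Hypothetical assertion: the second, identical request should come back
# from the cache with the server-added cache_status marker.
assert completion2.get("openpipe", {}).get("cache_status") == "HIT"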