Store input_tokens, output_tokens, token_details on Response, refs #610

simonw · simonw · commit 80956da83dea · 2024-11-19T18:30:28.000-08:00
diff --git a/llm/cli.py b/llm/cli.py
@@ -754,6 +754,9 @@ def logs_turn_off():
     responses.conversation_id,
     responses.duration_ms,
     responses.datetime_utc,
+    responses.input_tokens,
+    responses.output_tokens,
+    responses.token_details,
     conversations.name as conversation_name,
     conversations.model as conversation_model"""
 
diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py
@@ -1,6 +1,11 @@
 from llm import AsyncModel, EmbeddingModel, Model, hookimpl
 import llm
-from llm.utils import dicts_to_table_string, remove_dict_none_values, logging_client
+from llm.utils import (
+    dicts_to_table_string,
+    remove_dict_none_values,
+    logging_client,
+    simplify_usage_dict,
+)
 import click
 import datetime
 import httpx
@@ -391,6 +396,30 @@ def build_messages(self, prompt, conversation):
             messages.append({"role": "user", "content": attachment_message})
         return messages
 
+    def set_usage(self, response, usage):
+        """
+        Record token usage from an OpenAI-style usage dict on ``response``.
+
+        ``usage`` is expected to carry ``prompt_tokens``, ``completion_tokens``
+        and ``total_tokens``; any remaining keys are passed through
+        ``simplify_usage_dict`` and stored as the details. A falsy ``usage``
+        (``None`` or empty) leaves ``response`` untouched.
+        """
+        if not usage:
+            return
+        # Work on a copy so the caller's dict is not modified, and tolerate
+        # missing keys (they are recorded as None instead of raising KeyError).
+        remaining = dict(usage)
+        input_tokens = remaining.pop("prompt_tokens", None)
+        output_tokens = remaining.pop("completion_tokens", None)
+        # total_tokens is dropped rather than stored
+        remaining.pop("total_tokens", None)
+        response.set_usage(
+            input=input_tokens,
+            output=output_tokens,
+            details=simplify_usage_dict(remaining),
+        )
+
     def get_client(self, async_=False):
         kwargs = {}
         if self.api_base:
@@ -445,6 +460,7 @@ def execute(self, prompt, stream, response, conversation=None):
         messages = self.build_messages(prompt, conversation)
         kwargs = self.build_kwargs(prompt, stream)
         client = self.get_client()
+        usage = None
         if stream:
             completion = client.chat.completions.create(
                 model=self.model_name or self.model_id,
@@ -455,6 +471,8 @@ def execute(self, prompt, stream, response, conversation=None):
             chunks = []
             for chunk in completion:
                 chunks.append(chunk)
+                if chunk.usage:
+                    usage = chunk.usage.model_dump()
                 try:
                     content = chunk.choices[0].delta.content
                 except IndexError:
@@ -469,8 +487,11 @@ def execute(self, prompt, stream, response, conversation=None):
                 stream=False,
                 **kwargs,
             )
+            # completion.usage is optional; leave usage as None when absent
+            usage = completion.usage.model_dump() if completion.usage else None
             response.response_json = remove_dict_none_values(completion.model_dump())
             yield completion.choices[0].message.content
+        self.set_usage(response, usage)
         response._prompt_json = redact_data({"messages": messages})
 
 
@@ -493,6 +513,7 @@ async def execute(
         messages = self.build_messages(prompt, conversation)
         kwargs = self.build_kwargs(prompt, stream)
         client = self.get_client(async_=True)
+        usage = None
         if stream:
             completion = await client.chat.completions.create(
                 model=self.model_name or self.model_id,
@@ -502,6 +523,8 @@ async def execute(
             )
             chunks = []
             async for chunk in completion:
+                if chunk.usage:
+                    usage = chunk.usage.model_dump()
                 chunks.append(chunk)
                 try:
                     content = chunk.choices[0].delta.content
@@ -518,7 +541,10 @@ async def execute(
                 **kwargs,
             )
             response.response_json = remove_dict_none_values(completion.model_dump())
+            # completion.usage is optional; leave usage as None when absent
+            usage = completion.usage.model_dump() if completion.usage else None
             yield completion.choices[0].message.content
+        self.set_usage(response, usage)
         response._prompt_json = redact_data({"messages": messages})
 
 
diff --git a/llm/migrations.py b/llm/migrations.py
@@ -227,3 +227,14 @@ def m012_attachments_tables(db):
         ),
         pk=("response_id", "attachment_id"),
     )
+
+
+@migration
+def m013_usage(db):
+    """Add the token usage columns to the responses table."""
+    for column, column_type in (
+        ("input_tokens", int),
+        ("output_tokens", int),
+        ("token_details", str),
+    ):
+        db["responses"].add_column(column, column_type)
diff --git a/llm/models.py b/llm/models.py
@@ -208,6 +208,28 @@ def __init__(
         self._start: Optional[float] = None
         self._end: Optional[float] = None
         self._start_utcnow: Optional[datetime.datetime] = None
+        self.input_tokens: Optional[int] = None
+        self.output_tokens: Optional[int] = None
+        self.token_details: Optional[dict] = None
+
+    def set_usage(
+        self,
+        *,
+        input: Optional[int] = None,
+        output: Optional[int] = None,
+        details: Optional[dict] = None,
+    ):
+        """
+        Record token usage for this response.
+
+        All three values are keyword-only and default to None; each call
+        overwrites whatever was stored before.
+        """
+        self.input_tokens, self.output_tokens, self.token_details = (
+            input,
+            output,
+            details,
+        )
 
     @classmethod
     def from_row(cls, db, row):
@@ -272,11 +286,18 @@ def log_to_db(self, db):
                 for key, value in dict(self.prompt.options).items()
                 if value is not None
             },
-            "response": self.text(),
+            "response": self.text_or_raise(),
             "response_json": self.json(),
             "conversation_id": conversation.id,
             "duration_ms": self.duration_ms(),
             "datetime_utc": self.datetime_utc(),
+            # Token usage recorded via set_usage(); each value may be None
+            "input_tokens": self.input_tokens,
+            "output_tokens": self.output_tokens,
+            # Details are stored as JSON text, or NULL when empty/unset
+            "token_details": (
+                json.dumps(self.token_details) if self.token_details else None
+            ),
         }
         db["responses"].insert(response)
         # Persist any attachments - loop through with index
@@ -439,6 +458,9 @@ async def to_sync_response(self) -> Response:
         response._end = self._end
         response._start = self._start
         response._start_utcnow = self._start_utcnow
+        response.input_tokens = self.input_tokens
+        response.output_tokens = self.output_tokens
+        response.token_details = self.token_details
         return response
 
     @classmethod
diff --git a/llm/utils.py b/llm/utils.py
@@ -127,3 +127,27 @@ def logging_client() -> httpx.Client:
         transport=_LogTransport(httpx.HTTPTransport()),
         event_hooks={"request": [_no_accept_encoding], "response": [_log_response]},
     )
+
+
+def simplify_usage_dict(d):
+    """
+    Build a cleaned-up version of a usage dict.
+
+    Dictionaries are cleaned recursively: keys whose value equals 0, is
+    None, or is an empty dict (before or after cleaning) are dropped.
+    A falsy result, including a falsy non-dict ``d``, becomes ``{}``.
+    """
+
+    def prune(node):
+        # Anything that is not a dict is returned unchanged
+        if not isinstance(node, dict):
+            return node
+        kept = {}
+        for key, value in node.items():
+            if value != 0 and value != {}:
+                value = prune(value)
+                if value is not None and value != {}:
+                    kept[key] = value
+        return kept
+
+    return prune(d) or {}
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -66,13 +66,17 @@ def enqueue(self, messages):
 
     def execute(self, prompt, stream, response, conversation):
         self.history.append((prompt, stream, response, conversation))
+        gathered = []  # yielded messages; len() becomes the output token count
         while True:
             try:
                 messages = self._queue.pop(0)
-                yield from messages
+                for message in messages:
+                    gathered.append(message)
+                    yield message
                 break
             except IndexError:
                 break
+        response.set_usage(input=len(prompt.prompt.split()), output=len(gathered))
 
 
 class AsyncMockModel(llm.AsyncModel):
diff --git a/tests/test_chat.py b/tests/test_chat.py
@@ -62,6 +62,9 @@ def test_chat_basic(mock_model, logs_db):
             "conversation_id": conversation_id,
             "duration_ms": ANY,
             "datetime_utc": ANY,
+            "input_tokens": 1,
+            "output_tokens": 1,
+            "token_details": None,
         },
         {
             "id": ANY,
@@ -75,6 +78,9 @@ def test_chat_basic(mock_model, logs_db):
             "conversation_id": conversation_id,
             "duration_ms": ANY,
             "datetime_utc": ANY,
+            "input_tokens": 2,
+            "output_tokens": 2,
+            "token_details": None,
         },
     ]
     # Now continue that conversation
@@ -116,6 +122,9 @@ def test_chat_basic(mock_model, logs_db):
             "conversation_id": conversation_id,
             "duration_ms": ANY,
             "datetime_utc": ANY,
+            "input_tokens": 1,
+            "output_tokens": 1,
+            "token_details": None,
         }
     ]
 
@@ -153,6 +162,9 @@ def test_chat_system(mock_model, logs_db):
             "conversation_id": ANY,
             "duration_ms": ANY,
             "datetime_utc": ANY,
+            "input_tokens": 1,
+            "output_tokens": 1,
+            "token_details": None,
         }
     ]
 
@@ -181,6 +193,9 @@ def test_chat_options(mock_model, logs_db):
             "conversation_id": ANY,
             "duration_ms": ANY,
             "datetime_utc": ANY,
+            "input_tokens": 1,
+            "output_tokens": 1,
+            "token_details": None,
         }
     ]
 
diff --git a/tests/test_migrate.py b/tests/test_migrate.py
@@ -17,6 +17,9 @@
     "conversation_id": str,
     "duration_ms": int,
     "datetime_utc": str,
+    "input_tokens": int,
+    "output_tokens": int,
+    "token_details": str,
 }
 
 

Original file line number	Diff line number	Diff line change
`@@ -62,6 +62,9 @@ def test_chat_basic(mock_model, logs_db):`
`62`	`62`	`"conversation_id": conversation_id,`
`63`	`63`	`"duration_ms": ANY,`
`64`	`64`	`"datetime_utc": ANY,`
	`65`	`+ "input_tokens": 1,`
	`66`	`+ "output_tokens": 1,`
	`67`	`+ "token_details": None,`
`65`	`68`	`},`
`66`	`69`	`{`
`67`	`70`	`"id": ANY,`
`@@ -75,6 +78,9 @@ def test_chat_basic(mock_model, logs_db):`
`75`	`78`	`"conversation_id": conversation_id,`
`76`	`79`	`"duration_ms": ANY,`
`77`	`80`	`"datetime_utc": ANY,`
	`81`	`+ "input_tokens": 2,`
	`82`	`+ "output_tokens": 2,`
	`83`	`+ "token_details": None,`
`78`	`84`	`},`
`79`	`85`	`]`
`80`	`86`	`# Now continue that conversation`
`@@ -116,6 +122,9 @@ def test_chat_basic(mock_model, logs_db):`
`116`	`122`	`"conversation_id": conversation_id,`
`117`	`123`	`"duration_ms": ANY,`
`118`	`124`	`"datetime_utc": ANY,`
	`125`	`+ "input_tokens": 1,`
	`126`	`+ "output_tokens": 1,`
	`127`	`+ "token_details": None,`
`119`	`128`	`}`
`120`	`129`	`]`
`121`	`130`
`@@ -153,6 +162,9 @@ def test_chat_system(mock_model, logs_db):`
`153`	`162`	`"conversation_id": ANY,`
`154`	`163`	`"duration_ms": ANY,`
`155`	`164`	`"datetime_utc": ANY,`
	`165`	`+ "input_tokens": 1,`
	`166`	`+ "output_tokens": 1,`
	`167`	`+ "token_details": None,`
`156`	`168`	`}`
`157`	`169`	`]`
`158`	`170`
`@@ -181,6 +193,9 @@ def test_chat_options(mock_model, logs_db):`
`181`	`193`	`"conversation_id": ANY,`
`182`	`194`	`"duration_ms": ANY,`
`183`	`195`	`"datetime_utc": ANY,`
	`196`	`+ "input_tokens": 1,`
	`197`	`+ "output_tokens": 1,`
	`198`	`+ "token_details": None,`
`184`	`199`	`}`
`185`	`200`	`]`
`186`	`201`
Original file line number	Diff line number	Diff line change
`@@ -17,6 +17,9 @@`
`17`	`17`	`"conversation_id": str,`
`18`	`18`	`"duration_ms": int,`
`19`	`19`	`"datetime_utc": str,`
	`20`	`+ "input_tokens": int,`
	`21`	`+ "output_tokens": int,`
	`22`	`+ "token_details": str,`
`20`	`23`	`}`
`21`	`24`
`22`	`25`