Source code for nano_llm.chat.kv_cache

#!/usr/bin/env python3

class KVCache():
    """
    Abstract interface for storing & manipulating the KV cache, which encodes all
    the context and model state in the chat. These are implemented by different LLM
    backends and are backed by CUDA memory for each layer in the model, which these
    functions provide some ways to modify.

    It gets returned in the :class:`StreamingResponse` iterator from
    :meth:`NanoLLM.generate()` and as an optional argument can be re-used during
    the next generation to grow the cache instead of having to refill the chat
    context each request.

    For example, :meth:`KVCache.pop` will drop the most recent N tokens off the end
    of the cache, while :meth:`KVCache.remove` will remove a range of tokens from
    anywhere in the cache.

    The :class:`ChatHistory` object provides a higher-level way of maintaining
    consistency for removing messages from the chat by keeping track of their token
    counts and positions in the chat. It also keeps the KV cache between requests,
    so that only the new tokens need to be added (and the model only processes those).
    """
    def __init__(self):
        super().__init__()

        #: The current length of the KV cache
        self.num_tokens = 0
    def __len__(self):
        """
        Return the current length of the cache in terms of tokens or embedding positions.
        """
        return self.num_tokens
    def pop(self, tokens):
        """
        Remove the given number of tokens from the end of the cache.
        """
        raise NotImplementedError(f"{type(self)} did not implement pop()")
    def remove(self, start, stop, sync=True):
        """
        Remove a range of tokens from the cache, from the start index (inclusive)
        to the stop index (exclusive).
        """
        raise NotImplementedError(f"{type(self)} did not implement remove()")
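
Below is a minimal usage sketch (not part of this module) of how the KV cache returned by :meth:`NanoLLM.generate()` can be passed back in on the next request, so the cache grows instead of the chat context being refilled. The `kv_cache=` keyword, the `response.kv_cache` attribute, and the model name passed to :meth:`NanoLLM.from_pretrained()` are assumptions inferred from the docstring above, not guaranteed API details.

from nano_llm import NanoLLM

# load a chat model through one of the LLM backends (model name is illustrative)
model = NanoLLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", api="mlc")

kv_cache = None

for prompt in ["Hi, who are you?", "Tell me more about yourself."]:
    # re-using the previous KV cache means only the new tokens get processed
    response = model.generate(prompt, kv_cache=kv_cache, streaming=True)

    for token in response:
        print(token, end="", flush=True)

    kv_cache = response.kv_cache  # assumed attribute holding the updated cache
    print(f"\n(cache length: {len(kv_cache)} tokens)")

A concrete backend subclass would override :meth:`KVCache.pop` and :meth:`KVCache.remove` to shift or truncate its per-layer CUDA buffers and update ``num_tokens`` accordingly.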