#!/usr/bin/env python3
class KVCache():
    """
    Abstract interface for storing and manipulating the KV cache,
    which encodes all of the context and model state in the chat.

    These are implemented by the different LLM backends and are
    backed by CUDA memory for each layer in the model; the methods
    in this class provide ways of modifying it.

    It gets returned in the :class:`StreamingResponse` iterator
    from :meth:`NanoLLM.generate()`, and can optionally be passed
    back in during the next generation to grow the cache
    instead of having to refill the chat context each request.

    For example, :meth:`KVCache.pop` will drop the most recent
    N tokens off the end of the cache, while :meth:`KVCache.remove`
    will remove a range of tokens from anywhere in the cache.

    The :class:`ChatHistory` object provides a higher-level way
    of maintaining consistency when removing messages from the chat,
    by keeping track of their token counts and positions in the chat.
    It also keeps the KV cache between requests, so that only the
    new tokens need to be added (and the model only processes those).
    """
    def __init__(self):
        super().__init__()

        #: The current length of the KV cache
        self.num_tokens = 0
    def __len__(self):
        """
        Return the current length of the cache in terms of tokens or embedding positions.
        """
        return self.num_tokens
    def pop(self, tokens):
        """
        Remove the given number of tokens from the end of the cache.
        """
        raise NotImplementedError(f"{type(self)} did not implement pop()")
    def remove(self, start, stop, sync=True):
        """
        Remove a range of tokens from the cache, from the start index (inclusive) to the stop index (exclusive).
        """
        raise NotImplementedError(f"{type(self)} did not implement remove()")
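

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the NanoLLM API): a hypothetical
# list-backed subclass that makes the expected pop()/remove() semantics
# concrete. Real backends perform the same bookkeeping against per-layer
# CUDA tensors rather than a Python list; the ListKVCache name and its
# internals are assumptions for illustration only.
# ---------------------------------------------------------------------------
class ListKVCache(KVCache):
    """
    Toy KV cache that tracks entries in a Python list instead of GPU memory.
    """
    def __init__(self):
        super().__init__()
        self.tokens = []  # stand-in for the per-layer key/value tensors

    def pop(self, tokens):
        if tokens > 0:
            del self.tokens[-tokens:]  # drop the most recent N entries
        self.num_tokens = len(self.tokens)

    def remove(self, start, stop, sync=True):
        del self.tokens[start:stop]    # drop entries in [start, stop)
        self.num_tokens = len(self.tokens)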
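

# ---------------------------------------------------------------------------
# Hedged usage sketch of reusing the cache between generations, based on the
# class docstring above: it assumes NanoLLM.generate() exposes the updated
# cache on the returned StreamingResponse (as response.kv_cache) and accepts
# it back through a `kv_cache` keyword argument. The model name, prompts,
# token counts, and the _example_kv_cache_reuse() name are placeholders;
# the function is illustrative and never called here.
# ---------------------------------------------------------------------------
def _example_kv_cache_reuse():
    from nano_llm import NanoLLM

    model = NanoLLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

    # the first request fills the cache with the prompt and generated tokens
    response = model.generate("What is the capital of France?")

    for token in response:
        pass  # consume the stream

    kv_cache = response.kv_cache  # cache now encodes the full exchange

    # the next request re-uses the cache, so only the new tokens get processed
    response = model.generate("And of Germany?", kv_cache=kv_cache)

    # drop the most recent 8 tokens off the end (e.g. to regenerate a reply)
    kv_cache.pop(8)

    # or remove a span of tokens from the middle of the context;
    # ChatHistory tracks message token positions to keep this consistent
    kv_cache.remove(120, 180)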