Implementing LLM Response Caching with Redis

Caching LLM responses saves money and improves latency:

import { createHash } from 'crypto';
import OpenAI from 'openai';
import Redis from 'ioredis';

const openai = new OpenAI();
const redis = new Redis();
const CACHE_TTL = 3600; // 1 hour

function hashPrompt(messages, model) {
  const content = JSON.stringify({ messages, model });
  return createHash('sha256').update(content).digest('hex');
}

async function cachedChat(messages, options = {}) {
  const { model = 'gpt-4', bypassCache = false } = options;
  const cacheKey = `llm:${hashPrompt(messages, model)}`;

  if (!bypassCache) {
    const cached = await redis.get(cacheKey);
    if (cached) {
      console.log('Cache HIT');
      return JSON.parse(cached);
    }
  }

  console.log('Cache MISS');
  const response = await openai.chat.completions.create({
    model,
    messages
  });

  await redis.setex(cacheKey, CACHE_TTL, JSON.stringify(response));
  return response;
}
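
A quick usage check, assuming the clients above are configured and the file runs as an ES module (for top-level await); the example prompt is illustrative:

const messages = [{ role: 'user', content: 'Summarize RAG in one sentence.' }];

const first = await cachedChat(messages);   // Cache MISS: calls the API and writes Redis
const second = await cachedChat(messages);  // Cache HIT: served from Redis
console.log(first.choices[0].message.content === second.choices[0].message.content); // true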

Semantic Caching

Exact-match hashing misses prompts that are merely similar. Semantic caching instead compares prompt embeddings and reuses a cached response when the similarity exceeds a threshold.
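
A minimal sketch of that idea; the semcache: key prefix, the 0.95 similarity cutoff, and the brute-force KEYS scan are illustrative assumptions rather than a production design (a vector index such as RediSearch handles the nearest-neighbor lookup at scale):

import { createHash } from 'crypto';
import OpenAI from 'openai';
import Redis from 'ioredis';

const openai = new OpenAI();
const redis = new Redis();
const TTL = 3600;
const SIMILARITY_THRESHOLD = 0.95; // Assumed cutoff; tune against real traffic

function cosineSimilarity(a, b) {
  let dot = 0, na = 0, nb = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    na += a[i] * a[i];
    nb += b[i] * b[i];
  }
  return dot / (Math.sqrt(na) * Math.sqrt(nb));
}

async function embed(text) {
  const res = await openai.embeddings.create({
    model: 'text-embedding-3-small',
    input: text
  });
  return res.data[0].embedding;
}

// Return a cached response whose prompt embedding is close enough to the query.
async function semanticLookup(prompt) {
  const queryEmbedding = await embed(prompt);
  const keys = await redis.keys('semcache:*'); // Naive linear scan; fine for a sketch
  for (const key of keys) {
    const raw = await redis.get(key);
    if (!raw) continue; // Entry may have expired between KEYS and GET
    const entry = JSON.parse(raw);
    if (cosineSimilarity(queryEmbedding, entry.embedding) >= SIMILARITY_THRESHOLD) {
      return entry.response;
    }
  }
  return null;
}

async function semanticStore(prompt, response) {
  const embedding = await embed(prompt);
  const key = `semcache:${createHash('sha256').update(prompt).digest('hex')}`;
  await redis.setex(key, TTL, JSON.stringify({ embedding, response }));
}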

Text Chunking Strategies for RAG Applications

Chunking strategies greatly affect RAG quality:

import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';

// Basic chunking
const splitter = new RecursiveCharacterTextSplitter({
  chunkSize: 1000,    // In characters by default, not tokens
  chunkOverlap: 200,
  separators: ['\n\n', '\n', ' ', '']
});

const chunks = await splitter.splitText(document);

Semantic Chunking

A lightweight approximation is to split on sentence boundaries and pack whole sentences into each chunk up to a token budget, so no chunk cuts a sentence in half:

async function semanticChunk(text, maxTokens = 500) {
  const sentences = text.match(/[^.!?]+[.!?]+/g) || [text];
  const chunks = [];
  let current = [];
  let tokenCount = 0;

  for (const sentence of sentences) {
    const tokens = sentence.split(/\s+/).length; // Rough estimate: one token per whitespace-delimited word
    if (tokenCount + tokens > maxTokens && current.length) {
      chunks.push(current.join(' '));
      current = [];
      tokenCount = 0;
    }
    current.push(sentence);
    tokenCount += tokens;
  }
  if (current.length) chunks.push(current.join(' '));
  return chunks;
}

Best Practices

  • Chunk size: 500-1000 tokens (note that chunkSize above counts characters, roughly 4 per token); see the check sketched after this list
  • Overlap: 10-20% of chunk size so context carries across chunk boundaries
  • Preserve semantic boundaries (sentences, paragraphs, headings) rather than splitting mid-sentence
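
A quick sanity check against the first guideline, using the same rough 4-characters-per-token heuristic (a real tokenizer such as tiktoken gives exact counts):

// Flag chunks that fall outside the ~500-1000 token guideline.
function auditChunks(chunks, minTokens = 500, maxTokens = 1000) {
  return chunks
    .map((chunk, i) => ({ index: i, tokens: Math.round(chunk.length / 4) }))
    .filter(({ tokens }) => tokens < minTokens || tokens > maxTokens);
}

console.log(auditChunks(chunks)); // `chunks` from the splitter example above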

Building Conversational AI with Context Memory in Node.js

Conversational AI with memory requires careful context management: each session's history must be tracked, replayed on every request, and trimmed to fit the model's context window:

import OpenAI from 'openai';

const openai = new OpenAI();

class ConversationManager {
  constructor(options = {}) {
    this.maxTokens = options.maxTokens || 4000;
    this.systemPrompt = options.systemPrompt || 'You are a helpful assistant.';
    this.conversations = new Map();
  }

  getHistory(sessionId) {
    if (!this.conversations.has(sessionId)) {
      this.conversations.set(sessionId, []);
    }
    return this.conversations.get(sessionId);
  }

  async chat(sessionId, userMessage) {
    const history = this.getHistory(sessionId);
    history.push({ role: 'user', content: userMessage });

    // Trim history if too long
    while (this.estimateTokens(history) > this.maxTokens) {
      history.shift();
    }

    const response = await openai.chat.completions.create({
      model: 'gpt-4',
      messages: [
        { role: 'system', content: this.systemPrompt },
        ...history
      ]
    });

    const reply = response.choices[0].message.content;
    history.push({ role: 'assistant', content: reply });
    return reply;
  }

  estimateTokens(messages) {
    // Rough heuristic: ~4 characters per token
    return messages.reduce((sum, m) => sum + m.content.length / 4, 0);
  }
}
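
For example (the session ID and prompts are illustrative, and top-level await assumes an ES module):

const manager = new ConversationManager({ maxTokens: 4000 });

console.log(await manager.chat('user-123', 'My name is Ada.'));
console.log(await manager.chat('user-123', 'What is my name?')); // Answered from the stored history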