Caching LLM responses saves money and improves latency: identical prompts are hashed into a cache key and answered from Redis instead of triggering a new API call.
import { createHash } from 'crypto';
import Redis from 'ioredis';
import OpenAI from 'openai';

const redis = new Redis();
const openai = new OpenAI();
const CACHE_TTL = 3600; // 1 hour

// Deterministic cache key: hash the messages together with the model name
// so the same prompt against a different model never collides.
function hashPrompt(messages, model) {
  const content = JSON.stringify({ messages, model });
  return createHash('sha256').update(content).digest('hex');
}

async function cachedChat(messages, options = {}) {
  const { model = 'gpt-4', bypassCache = false } = options;
  const cacheKey = `llm:${hashPrompt(messages, model)}`;

  if (!bypassCache) {
    const cached = await redis.get(cacheKey);
    if (cached) {
      console.log('Cache HIT');
      return JSON.parse(cached);
    }
  }

  console.log('Cache MISS');
  const response = await openai.chat.completions.create({
    model,
    messages
  });

  // Store the full response with a TTL so stale answers eventually expire.
  await redis.setex(cacheKey, CACHE_TTL, JSON.stringify(response));
  return response;
}
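A quick usage sketch, assuming an ESM module (for top-level await), a local Redis instance, and OPENAI_API_KEY in the environment; the example messages are illustrative:

// Second call with the same messages should be served from Redis.
const messages = [{ role: 'user', content: 'Explain Redis in one sentence.' }];

const first = await cachedChat(messages);                        // Cache MISS -> API call
const second = await cachedChat(messages);                       // Cache HIT  -> from Redis
const fresh = await cachedChat(messages, { bypassCache: true }); // forces a new API call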
Semantic Caching
Exact-match hashing only helps when the prompt is byte-for-byte identical. For similar (but not identical) queries, embed the prompt and reuse a cached response when its similarity to a previously seen prompt exceeds a threshold, as in the sketch below.
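A minimal in-memory sketch of that idea, reusing the same OpenAI client as above. The text-embedding-3-small model, the 0.95 threshold, and the plain array store are illustrative assumptions, not part of the setup above; a production cache would use a vector store (for example Redis with vector search) instead of a linear scan.

const SIMILARITY_THRESHOLD = 0.95; // tune per workload

const semanticCache = []; // entries: { embedding: number[], response }

function cosineSimilarity(a, b) {
  let dot = 0, normA = 0, normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}

async function embed(text) {
  const res = await openai.embeddings.create({
    model: 'text-embedding-3-small',
    input: text
  });
  return res.data[0].embedding;
}

async function semanticCachedChat(messages, model = 'gpt-4') {
  // Embed the latest user message and look for a close-enough prior prompt.
  const query = messages[messages.length - 1].content;
  const queryEmbedding = await embed(query);

  for (const entry of semanticCache) {
    if (cosineSimilarity(queryEmbedding, entry.embedding) >= SIMILARITY_THRESHOLD) {
      console.log('Semantic cache HIT');
      return entry.response;
    }
  }

  console.log('Semantic cache MISS');
  const response = await openai.chat.completions.create({ model, messages });
  semanticCache.push({ embedding: queryEmbedding, response });
  return response;
}

The threshold is the main design choice: too low and unrelated questions get each other's answers, too high and near-duplicates still miss the cache.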
