Rate limiting is critical for AI APIs: providers enforce hard requests-per-minute quotas and return HTTP 429 when you exceed them. Here's a robust client-side implementation built on the Bottleneck library:
import Bottleneck from 'bottleneck';
import OpenAI from 'openai'; // needed for the wrapped calls below

const openai = new OpenAI(); // reads OPENAI_API_KEY from the environment

const limiter = new Bottleneck({
  reservoir: 60,                       // 60 requests...
  reservoirRefreshAmount: 60,          // ...refilled in full...
  reservoirRefreshInterval: 60 * 1000, // ...every minute
  maxConcurrent: 5,                    // at most 5 calls in flight at once
  minTime: 100                         // 100ms between requests
});
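The reservoir is what enforces the hard 60-per-minute quota; maxConcurrent and minTime just smooth the flow so all 60 calls don't fire in the first second. For visibility into queuing behavior, Bottleneck also emits lifecycle events; a small optional sketch using its depleted event:

// Optional: warn when the per-minute reservoir runs dry.
// Queued jobs are not dropped; they wait for the next refresh.
limiter.on('depleted', () => {
  console.warn('Reservoir empty; further requests queue until the next refresh.');
});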
// Wrap OpenAI calls
const rateLimitedChat = limiter.wrap(async (prompt) => {
  return openai.chat.completions.create({
    model: 'gpt-4',
    messages: [{ role: 'user', content: prompt }]
  });
});
// Use with automatic queuing
const results = await Promise.all(
  prompts.map(p => rateLimitedChat(p))
);
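One caveat: Promise.all rejects as soon as any single call fails, throwing away responses that already succeeded. If partial results are acceptable, Promise.allSettled is a drop-in alternative:

// Keep successes even when some prompts fail
const settled = await Promise.allSettled(
  prompts.map(p => rateLimitedChat(p))
);
const completions = settled
  .filter(r => r.status === 'fulfilled')
  .map(r => r.value);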
Exponential Backoff

Client-side limiting alone can't prevent every 429 (quotas may be shared across processes, or the server may throttle for other reasons), so pair it with retries that back off exponentially:
async function withRetry(fn, maxRetries = 3) {
  for (let i = 0; i < maxRetries; i++) {
    try {
      return await fn();
    } catch (e) {
      // Retry only on 429 (rate limited), and only while attempts remain
      if (e.status === 429 && i < maxRetries - 1) {
        // Wait 1s, 2s, 4s, ... before the next attempt
        await new Promise(r => setTimeout(r, Math.pow(2, i) * 1000));
      } else {
        throw e;
      }
    }
  }
}
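The two helpers compose naturally: wrap the rate-limited call in withRetry so any 429s that slip past the client-side limiter still get retried. A minimal sketch:

// Rate-limited and retried; safe to fan out across many prompts
const answers = await Promise.all(
  prompts.map(p => withRetry(() => rateLimitedChat(p)))
);

In production you'd typically also add random jitter to the delay (e.g. an extra Math.random() * 1000 milliseconds) so many clients hitting the same limit don't retry in lockstep.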
