Essential AI Libraries for Node.js Developers

Getting started with AI in Node.js is simpler than you might think. The key libraries to know are:

1. OpenAI SDK

npm install openai

import OpenAI from 'openai';

const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

const completion = await openai.chat.completions.create({
  model: 'gpt-4',
  messages: [{ role: 'user', content: 'Hello!' }]
});

console.log(completion.choices[0].message.content);

2. LangChain.js

For more complex chains and agents:

npm install langchain
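
Most chat-model functionality lives in provider packages; a minimal sketch, assuming @langchain/openai is installed alongside langchain and OPENAI_API_KEY is set:

import { ChatOpenAI } from '@langchain/openai';

// Direct model invocation; chains and agents build on the same interface
const model = new ChatOpenAI({ model: 'gpt-4' });
const reply = await model.invoke('What is LangChain.js in one sentence?');
console.log(reply.content);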

3. Hugging Face Transformers

For local, in-process model inference with Transformers.js:

npm install @huggingface/transformers
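
A minimal sketch using the Transformers.js pipeline helper; the default sentiment model is downloaded and cached on first run, then inference runs entirely locally:

import { pipeline } from '@huggingface/transformers';

// First call downloads and caches a small default model, then runs offline
const classify = await pipeline('sentiment-analysis');
const result = await classify('Node.js makes AI integration straightforward.');
console.log(result); // e.g. [{ label: 'POSITIVE', score: ... }]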

These three libraries cover the vast majority of AI use cases in Node.js applications.

Streaming LLM Responses in Node.js with Server-Sent Events

Streaming responses from LLMs provides a much better UX. Here’s how to implement it properly:

import OpenAI from 'openai';

const openai = new OpenAI();

async function streamChat(prompt) {
  const stream = await openai.chat.completions.create({
    model: 'gpt-4',
    messages: [{ role: 'user', content: prompt }],
    stream: true
  });

  for await (const chunk of stream) {
    const content = chunk.choices[0]?.delta?.content || '';
    process.stdout.write(content);
  }
}

await streamChat('Explain async iterators');

Express.js SSE Integration

app.get('/chat', async (req, res) => {
  res.setHeader('Content-Type', 'text/event-stream');
  res.setHeader('Cache-Control', 'no-cache');
  
  const stream = await openai.chat.completions.create({
    model: 'gpt-4',
    messages: [{ role: 'user', content: req.query.prompt }],
    stream: true
  });

  for await (const chunk of stream) {
    res.write(`data: ${JSON.stringify(chunk)}\n\n`);
  }
  res.end();
});
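
On the client side, the browser's built-in EventSource can consume this endpoint; a minimal sketch (the #output element is a placeholder):

// Browser-side consumer for the /chat SSE endpoint above
const source = new EventSource('/chat?prompt=' + encodeURIComponent('Explain async iterators'));

source.onmessage = (event) => {
  const chunk = JSON.parse(event.data);
  const content = chunk.choices[0]?.delta?.content || '';
  document.querySelector('#output').textContent += content;
};

// The server calls res.end() when the completion finishes; close on error/end
source.onerror = () => source.close();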

Building a Vector Database Pipeline with Pinecone and Node.js

Vector databases are essential for RAG (Retrieval Augmented Generation). Here’s a complete setup with Pinecone:

npm install @pinecone-database/pinecone openai

import { Pinecone } from '@pinecone-database/pinecone';
import OpenAI from 'openai';

const pinecone = new Pinecone({ apiKey: process.env.PINECONE_API_KEY });
const openai = new OpenAI();
const index = pinecone.Index('documents');

// Generate embeddings
async function embed(text) {
  const response = await openai.embeddings.create({
    model: 'text-embedding-3-small',
    input: text
  });
  return response.data[0].embedding;
}

// Store document
async function storeDoc(id, content) {
  const vector = await embed(content);
  await index.upsert([{ id, values: vector, metadata: { content } }]);
}

// Query similar docs
async function querySimilar(query, topK = 5) {
  const vector = await embed(query);
  const results = await index.query({ vector, topK, includeMetadata: true });
  return results.matches.map(m => m.metadata.content);
}
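
To complete the RAG loop, pass the retrieved chunks to the chat model as context; a minimal sketch built on the helpers above:

// Answer a question using the retrieved documents as grounding context
async function answerWithContext(question) {
  const docs = await querySimilar(question);
  const context = docs.join('\n---\n');

  const response = await openai.chat.completions.create({
    model: 'gpt-4',
    messages: [
      { role: 'system', content: `Answer using only this context:\n${context}` },
      { role: 'user', content: question }
    ]
  });
  return response.choices[0].message.content;
}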

Implementing OpenAI Function Calling in Node.js

Function calling lets LLMs interact with your APIs. Here's the core request-and-dispatch pattern:

const tools = [{
  type: 'function',
  function: {
    name: 'get_weather',
    description: 'Get current weather for a location',
    parameters: {
      type: 'object',
      properties: {
        location: { type: 'string', description: 'City name' },
        unit: { type: 'string', enum: ['celsius', 'fahrenheit'] }
      },
      required: ['location']
    }
  }
}];

const functions = {
  get_weather: async ({ location, unit = 'celsius' }) => {
    // Your API call here
    return { temp: 22, condition: 'sunny', location };
  }
};

async function chat(message) {
  const response = await openai.chat.completions.create({
    model: 'gpt-4',
    messages: [{ role: 'user', content: message }],
    tools
  });

  const toolCalls = response.choices[0].message.tool_calls;
  if (toolCalls) {
    for (const call of toolCalls) {
      const fn = functions[call.function.name];
      const args = JSON.parse(call.function.arguments);
      const result = await fn(args);
      console.log(result);
    }
  }
}
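
For the model to use the tool output in its answer, the results go back in a second request as role 'tool' messages. A sketch of that follow-up step, using the same tools and functions maps:

// Send tool results back so the model can produce a final answer
async function chatWithToolResults(message) {
  const messages = [{ role: 'user', content: message }];
  const first = await openai.chat.completions.create({ model: 'gpt-4', messages, tools });
  const assistantMessage = first.choices[0].message;

  if (!assistantMessage.tool_calls) return assistantMessage.content;

  // The assistant message with tool_calls must precede the tool results
  messages.push(assistantMessage);
  for (const call of assistantMessage.tool_calls) {
    const result = await functions[call.function.name](JSON.parse(call.function.arguments));
    messages.push({ role: 'tool', tool_call_id: call.id, content: JSON.stringify(result) });
  }

  const second = await openai.chat.completions.create({ model: 'gpt-4', messages, tools });
  return second.choices[0].message.content;
}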

Running Local LLMs with Ollama and Node.js

Run LLMs locally without API costs using Ollama:

# Install Ollama
curl -fsSL https://ollama.ai/install.sh | sh

# Pull a model
ollama pull llama2
ollama pull codellama

Node.js Integration

npm install ollama

import { Ollama } from 'ollama';

const ollama = new Ollama();

// Simple completion
const response = await ollama.chat({
  model: 'llama2',
  messages: [{ role: 'user', content: 'Explain closures in JS' }]
});

console.log(response.message.content);

// Streaming
const stream = await ollama.chat({
  model: 'codellama',
  messages: [{ role: 'user', content: 'Write a fibonacci function' }],
  stream: true
});

for await (const chunk of stream) {
  process.stdout.write(chunk.message.content);
}

Memory Requirements

  • 7B models: 8GB RAM
  • 13B models: 16GB RAM
  • 70B models: 64GB+ RAM

Rate Limiting AI API Calls in Node.js with Bottleneck

Rate limiting is critical for AI APIs. Here’s a robust implementation:

import Bottleneck from 'bottleneck';

const limiter = new Bottleneck({
  reservoir: 60,           // 60 requests
  reservoirRefreshAmount: 60,
  reservoirRefreshInterval: 60 * 1000, // per minute
  maxConcurrent: 5,
  minTime: 100             // 100ms between requests
});

// Wrap OpenAI calls
const rateLimitedChat = limiter.wrap(async (prompt) => {
  return openai.chat.completions.create({
    model: 'gpt-4',
    messages: [{ role: 'user', content: prompt }]
  });
});

// Use with automatic queuing
const results = await Promise.all(
  prompts.map(p => rateLimitedChat(p))
);

Exponential Backoff

async function withRetry(fn, maxRetries = 3) {
  for (let i = 0; i < maxRetries; i++) {
    try {
      return await fn();
    } catch (e) {
      if (e.status === 429 && i < maxRetries - 1) {
        await new Promise(r => setTimeout(r, Math.pow(2, i) * 1000));
      } else throw e;
    }
  }
}
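
The two helpers compose: wrap the rate-limited call in withRetry so any 429s that still slip through get exponential backoff:

// Rate limiting and retries combined
const result = await withRetry(() => rateLimitedChat('Summarize the Node.js event loop'));
console.log(result.choices[0].message.content);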

Building Autonomous AI Agents with LangChain.js

LangChain agents can use tools autonomously. Here’s a complete agent setup:

import { ChatOpenAI } from '@langchain/openai';
import { AgentExecutor, createOpenAIToolsAgent } from 'langchain/agents';
import { DynamicTool } from '@langchain/core/tools';
import { ChatPromptTemplate } from '@langchain/core/prompts';

const tools = [
  new DynamicTool({
    name: 'calculator',
    description: 'Performs math calculations',
    func: async (input) => {
      return String(eval(input)); // Use mathjs in production
    }
  }),
  new DynamicTool({
    name: 'search',
    description: 'Search the web',
    func: async (query) => {
      // Your search API here
      return `Results for: ${query}`;
    }
  })
];

const llm = new ChatOpenAI({ model: 'gpt-4' });
const prompt = ChatPromptTemplate.fromMessages([
  ['system', 'You are a helpful assistant with access to tools.'],
  ['human', '{input}'],
  ['placeholder', '{agent_scratchpad}']
]);

const agent = await createOpenAIToolsAgent({ llm, tools, prompt });
const executor = new AgentExecutor({ agent, tools });

const result = await executor.invoke({
  input: 'What is 25 * 48 and search for Node.js news'
});
console.log(result.output);
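
As the comment on the calculator tool suggests, eval should not run untrusted input. A sketch of the same tool backed by mathjs instead (assumes npm install mathjs); swapping it into the tools array above is a drop-in change:

import { evaluate } from 'mathjs';

// Safer calculator tool: mathjs parses math expressions instead of executing JS
const calculatorTool = new DynamicTool({
  name: 'calculator',
  description: 'Performs math calculations',
  func: async (input) => String(evaluate(input))
});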

Implementing LLM Response Caching with Redis

Caching LLM responses saves money and improves latency:

import { createHash } from 'crypto';
import Redis from 'ioredis';

const redis = new Redis();
const CACHE_TTL = 3600; // 1 hour

function hashPrompt(messages, model) {
  const content = JSON.stringify({ messages, model });
  return createHash('sha256').update(content).digest('hex');
}

async function cachedChat(messages, options = {}) {
  const { model = 'gpt-4', bypassCache = false } = options;
  const cacheKey = `llm:${hashPrompt(messages, model)}`;

  if (!bypassCache) {
    const cached = await redis.get(cacheKey);
    if (cached) {
      console.log('Cache HIT');
      return JSON.parse(cached);
    }
  }

  console.log('Cache MISS');
  const response = await openai.chat.completions.create({
    model,
    messages
  });

  await redis.setex(cacheKey, CACHE_TTL, JSON.stringify(response));
  return response;
}

Semantic Caching

For similar (not exact) queries, use embedding similarity with a threshold.
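
A minimal in-memory sketch of that idea, reusing the embed helper from the Pinecone section; the 0.95 threshold is an arbitrary starting point to tune:

const semanticCache = []; // { embedding, response } entries kept in memory

function cosineSimilarity(a, b) {
  let dot = 0, normA = 0, normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}

async function semanticCachedChat(prompt, threshold = 0.95) {
  const embedding = await embed(prompt);

  // Return the cached response for a sufficiently similar earlier prompt
  const hit = semanticCache.find(e => cosineSimilarity(e.embedding, embedding) >= threshold);
  if (hit) return hit.response;

  const response = await openai.chat.completions.create({
    model: 'gpt-4',
    messages: [{ role: 'user', content: prompt }]
  });
  semanticCache.push({ embedding, response });
  return response;
}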

Text Chunking Strategies for RAG Applications

Chunking strategies greatly affect RAG quality:

import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';

// Basic chunking
const splitter = new RecursiveCharacterTextSplitter({
  chunkSize: 1000,
  chunkOverlap: 200,
  separators: ['\n\n', '\n', ' ', '']
});

const chunks = await splitter.splitText(document);

Semantic Chunking

async function semanticChunk(text, maxTokens = 500) {
  const sentences = text.match(/[^.!?]+[.!?]+/g) || [text];
  const chunks = [];
  let current = [];
  let tokenCount = 0;

  for (const sentence of sentences) {
    const tokens = sentence.split(/\s+/).length; // Approximate: whitespace-delimited words as a token proxy
    if (tokenCount + tokens > maxTokens && current.length) {
      chunks.push(current.join(' '));
      current = [];
      tokenCount = 0;
    }
    current.push(sentence);
    tokenCount += tokens;
  }
  if (current.length) chunks.push(current.join(' '));
  return chunks;
}

Best Practices

  • Chunk size: 500-1000 tokens
  • Overlap: 10-20% for context
  • Preserve semantic boundaries

Building Conversational AI with Context Memory in Node.js

Building a conversational AI with memory requires careful context management:

class ConversationManager {
  constructor(options = {}) {
    this.maxTokens = options.maxTokens || 4000;
    this.systemPrompt = options.systemPrompt || 'You are a helpful assistant.';
    this.conversations = new Map();
  }

  getHistory(sessionId) {
    if (!this.conversations.has(sessionId)) {
      this.conversations.set(sessionId, []);
    }
    return this.conversations.get(sessionId);
  }

  async chat(sessionId, userMessage) {
    const history = this.getHistory(sessionId);
    history.push({ role: 'user', content: userMessage });

    // Trim history if too long
    while (this.estimateTokens(history) > this.maxTokens) {
      history.shift();
    }

    const response = await openai.chat.completions.create({
      model: 'gpt-4',
      messages: [
        { role: 'system', content: this.systemPrompt },
        ...history
      ]
    });

    const reply = response.choices[0].message.content;
    history.push({ role: 'assistant', content: reply });
    return reply;
  }

  estimateTokens(messages) {
    return messages.reduce((sum, m) => sum + m.content.length / 4, 0);
  }
}
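
Usage is per session: keep one manager instance and route each user through their own session id (the token count is a rough 4-characters-per-token heuristic, so trimming is approximate):

const manager = new ConversationManager({
  systemPrompt: 'You are a concise technical assistant.'
});

// Each session id gets its own rolling history
await manager.chat('session-123', 'What is the event loop?');
const followUp = await manager.chat('session-123', 'How does that relate to worker threads?');
console.log(followUp);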