FastAPI has become my go-to backend framework for AI applications. It's fast, Pythonic, has excellent async support, and auto-generates OpenAPI docs. Here's how I structure full-stack AI apps using FastAPI and React.
## Project Structure
```
project/
├── backend/
│   ├── main.py
│   ├── routers/
│   │   └── chat.py
│   ├── services/
│   │   └── llm_service.py
│   └── requirements.txt
├── frontend/
│   ├── src/
│   │   ├── api/
│   │   └── components/
│   └── package.json
└── docker-compose.yml
```
## Setting Up FastAPI
```python
# backend/main.py
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

from routers import chat

app = FastAPI(title="AI Chat API", version="1.0.0")

app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:5173"],  # Vite dev server
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

app.include_router(chat.router, prefix="/api/chat", tags=["chat"])
```
## Streaming LLM Responses
The best UX for LLM-powered apps is streaming — showing tokens as they arrive rather than waiting for the full response. FastAPI + StreamingResponse makes this easy:
```python
# backend/routers/chat.py
from fastapi import APIRouter
from fastapi.responses import StreamingResponse
from langchain_community.llms import Ollama
from pydantic import BaseModel

router = APIRouter()
llm = Ollama(model="llama3.2")

class ChatRequest(BaseModel):
    message: str

async def generate_stream(prompt: str):
    async for chunk in llm.astream(prompt):
        yield f"data: {chunk}\n\n"
    yield "data: [DONE]\n\n"

@router.post("/stream")
async def chat_stream(request: ChatRequest):
    return StreamingResponse(
        generate_stream(request.message),
        media_type="text/event-stream",
    )
```
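The endpoint speaks plain Server-Sent Events: each chunk becomes a `data:` line, and a blank line terminates the event. A stdlib-only sketch of that framing and the matching parse — the helper names here are mine, for illustration, not part of the app above:

```python
def sse_event(data: str) -> str:
    # Frame one chunk as an SSE event: "data: <chunk>" plus a blank separator line
    return f"data: {data}\n\n"

def parse_sse(stream: str) -> list[str]:
    # Collect data payloads, stopping at the [DONE] sentinel the endpoint emits
    chunks = []
    for line in stream.split("\n"):
        if line.startswith("data: "):
            payload = line[len("data: "):]
            if payload == "[DONE]":
                break
            chunks.append(payload)
    return chunks

raw = sse_event("Hello") + sse_event(" world") + sse_event("[DONE]")
# parse_sse(raw) → ["Hello", " world"]
```

The React component below does essentially the same parse on the client side.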
## Consuming SSE in React
```tsx
// frontend/src/components/Chat.tsx
import { useState } from "react";

export function Chat() {
  const [response, setResponse] = useState("");

  const sendMessage = async (message: string) => {
    setResponse("");
    const res = await fetch("http://localhost:8000/api/chat/stream", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ message }),
    });

    const reader = res.body!.getReader();
    const decoder = new TextDecoder();

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      const text = decoder.decode(value);
      const lines = text.split("\n");
      for (const line of lines) {
        if (line.startsWith("data: ") && line !== "data: [DONE]") {
          setResponse((prev) => prev + line.slice(6));
        }
      }
    }
  };

  return (
    <div>
      <button onClick={() => sendMessage("Hello!")}>Send</button>
      <p>{response}</p>
    </div>
  );
}
```
## Docker Compose Setup
```yaml
# docker-compose.yml
version: "3.8"

services:
  backend:
    build: ./backend
    ports:
      - "8000:8000"
    volumes:
      - ./backend:/app
    environment:
      - OLLAMA_HOST=http://ollama:11434

  frontend:
    build: ./frontend
    ports:
      - "5173:5173"
    depends_on:
      - backend

  ollama:
    image: ollama/ollama
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama

volumes:
  ollama_data:
```
## Key Patterns I've Learned
- Always use async in FastAPI when calling external APIs or LLMs — don't block the event loop
- Pydantic models for request/response validation save you hours of debugging
- Redis for caching common queries can dramatically reduce LLM load
- Rate limiting with `slowapi` is essential before deploying any public AI endpoint
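On the Redis point: the trick is a deterministic cache key per (model, prompt) pair. A sketch of the key scheme — the `llm:` prefix and the one-hour TTL are my choices, and the commented usage assumes redis-py talking to a local Redis instance:

```python
import hashlib
import json

def cache_key(model: str, prompt: str) -> str:
    # Hash the request so identical prompts map to the same Redis key
    payload = json.dumps({"model": model, "prompt": prompt}, sort_keys=True)
    return "llm:" + hashlib.sha256(payload.encode()).hexdigest()

# Hypothetical usage with redis-py:
#   r = redis.Redis(host="localhost", port=6379)
#   key = cache_key("llama3.2", prompt)
#   answer = r.get(key)
#   if answer is None:
#       answer = llm.invoke(prompt)
#       r.setex(key, 3600, answer)  # cache for an hour
```

Hashing the full request (rather than the raw prompt) keeps keys bounded in size and lets you add fields like temperature later without key collisions.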
This stack powers most of my current AI projects and scales surprisingly well for small-to-medium workloads. Give it a try!