version: "3" services: huggingface_inference: image: ghcr.io/huggingface/text-generation-inference:1.1.0 volumes: - ./models/:/data ports: - "3000:80" shm_size: '1gb' command: --model-id meta-llama/Llama-2-7b-chat-hf --sharded true --num-shards 2 # --json-output environment: # - num_shard=1 - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN} - DISABLE_CUSTOM_KERNELS=${DISABLE_CUSTOM_KERNELS} - HF_HUB_ENABLE_HF_TRANSFER={HF_HUB_ENABLE_HF_TRANSFER} deploy: resources: reservations: devices: - driver: nvidia count: all capabilities: [gpu] working_dir: /app mongo_chatui: image: mongo:latest ports: - "27017:27017" chat_ui: build: context: ./chat-ui dockerfile: Dockerfile command: > sh -c "npm run dev -- --host" volumes: - ./chat-ui/.env.local:/app/chat-ui/.env.local ports: - "5173:5173" depends_on: - mongo_chatui - huggingface_inference # llamacpp: # image: ghcr.io/ggerganov/llama.cpp:full-cuda # debug: # image: ubuntu:latest # entrypoint: /bin/sh # stdin_open: true # docker run -i # tty: true # docker run -t # volumes: # - ./models/:/data # 70b # meta-llama/Llama-2-70b-chat-hf # 70b q # TheBloke/Llama-2-70B-chat-AWQ # 7b # meta-llama/Llama-2-7b-chat-hf # mistralai/Mistral-7B-Instruct-v0.1 # 7b q # TheBloke/Mistral-7B-Instruct-v0.1-AWQ # TheBloke/Llama-2-7b-Chat-AWQ # daryl149/llama-2-7b-chat-hf # georgesung/llama2_7b_chat_uncensored