version: "3"

services:
  huggingface_inference:
    image: ghcr.io/huggingface/text-generation-inference:1.1.0
    volumes:
      - ./models/:/data
    ports:
      - "3000:80"
    shm_size: '1gb'
    command: --model-id meta-llama/Llama-2-7b-chat-hf --sharded true --num-shards 2
#    --json-output
    environment:
      # - num_shard=1
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
      - DISABLE_CUSTOM_KERNELS=${DISABLE_CUSTOM_KERNELS}
      - HF_HUB_ENABLE_HF_TRANSFER={HF_HUB_ENABLE_HF_TRANSFER}
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    working_dir: /app

  mongo_chatui:
    image: mongo:latest
    ports:
      - "27017:27017"

  chat_ui:
    build:
      context: ./chat-ui
      dockerfile: Dockerfile
    command: >
      sh -c "npm run dev -- --host"
    volumes:
      - ./chat-ui/.env.local:/app/chat-ui/.env.local
    ports:
      - "5173:5173"
    depends_on:
      - mongo_chatui
      - huggingface_inference

#  llamacpp:
#    image: ghcr.io/ggerganov/llama.cpp:full-cuda
  # debug:
  #   image: ubuntu:latest
  #   entrypoint: /bin/sh
  #   stdin_open: true # docker run -i
  #   tty: true        # docker run -t
  #   volumes:
  #     - ./models/:/data

# 70b
# meta-llama/Llama-2-70b-chat-hf

# 70b q
# TheBloke/Llama-2-70B-chat-AWQ

# 7b
# meta-llama/Llama-2-7b-chat-hf
# mistralai/Mistral-7B-Instruct-v0.1

# 7b q
# TheBloke/Mistral-7B-Instruct-v0.1-AWQ
# TheBloke/Llama-2-7b-Chat-AWQ

# daryl149/llama-2-7b-chat-hf
# georgesung/llama2_7b_chat_uncensored