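- # Compare several chat-model backends behind a common LangChain interface:
- # a local Hugging Face pipeline, a local Ollama server, the OpenAI API, and a
- # custom chat model that shells into an Ollama Docker container.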
- from langchain_community.chat_models import ChatOllama
- from langchain_openai import ChatOpenAI
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
- import torch
- from langchain_huggingface import HuggingFacePipeline
- from typing import Any, List, Optional, Dict
- from langchain_core.callbacks import CallbackManagerForLLMRun
- from langchain_core.language_models import BaseChatModel
- from langchain_core.messages import BaseMessage, AIMessage, HumanMessage, SystemMessage
- from langchain_core.outputs import ChatResult, ChatGeneration
- from pydantic import Field
- import subprocess
- import time
- from dotenv import load_dotenv
- load_dotenv()
- # System prompt (zh-TW): "You are an AI assistant from Taiwan named TAIDE, happy to
- # help users from a Taiwanese perspective, and you answer in Traditional Chinese."
- system_prompt: str = "你是一個來自台灣的AI助理,你的名字是 TAIDE,樂於以台灣人的立場幫助使用者,會用繁體中文回答問題。"
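- # Backend 1: Meta-Llama-3.1-8B-Instruct run locally through a Hugging Face
- # text-generation pipeline wrapped by LangChain's HuggingFacePipeline.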
- def hf():
-     model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-     tokenizer = AutoTokenizer.from_pretrained(model_id)
-     llm = HuggingFacePipeline.from_model_id(
-         model_id=model_id,
-         task="text-generation",
-         model_kwargs={"torch_dtype": torch.bfloat16},
-         pipeline_kwargs={"return_full_text": False,
-                          "max_new_tokens": 512,
-                          "repetition_penalty": 1.03},
-         device_map="cuda")  # use device_map only; passing device= as well conflicts with it
-     # print(llm.pipeline)
-     # Llama 3.1 ships no pad token; reuse the first EOS token id so batching works.
-     llm.pipeline.tokenizer.pad_token_id = llm.pipeline.model.config.eos_token_id[0]
-     return llm
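- # Backend 2: a model served by a local Ollama instance, via ChatOllama.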
- def ollama_():
-     # model = "cwchang/llama3-taide-lx-8b-chat-alpha1"
-     model = "llama3.1:latest"
-     # model = "llama3.1:70b"
-     # model = "893379029/piccolo-large-zh-v2"
-     # System prompt (zh-TW): "You are an AI assistant from Taiwan, happy to help users
-     # from a Taiwanese perspective, and you answer in Traditional Chinese. Answer in
-     # 5 sentences or fewer."
-     sys = "你是一個來自台灣的 AI 助理,樂於以台灣人的立場幫助使用者,會用繁體中文回答問題。請用 5 句話以內回答問題。"
-     # llm = ChatOllama(model=model, num_gpu=2, num_thread=32, temperature=0, system=sys, keep_alive="10m", verbose=True)
-     llm = ChatOllama(model=model, num_gpu=2, temperature=0, system=sys, keep_alive="10m")
-     return llm
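- # Backend 3: OpenAI's hosted gpt-4o-mini (requires OPENAI_API_KEY, loaded from .env).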
- def openai_():  # not local: calls the OpenAI API
-     llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")
-     return llm
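- # Custom BaseChatModel that builds a Llama 3-style prompt by hand and runs it
- # with `ollama run` inside a Docker container named "ollama".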
- class OllamaChatModel(BaseChatModel):
-     """Chat model that runs a prompt through an Ollama model inside a Docker container."""
-     model_name: str = Field(default="taide-local-llama3")
-
-     def _generate(
-         self,
-         messages: List[BaseMessage],
-         stop: Optional[List[str]] = None,
-         run_manager: Optional[CallbackManagerForLLMRun] = None,
-         **kwargs: Any,
-     ) -> ChatResult:
-         # Normalize LangChain message objects into role/content dicts.
-         formatted_messages = []
-         for msg in messages:
-             if isinstance(msg, HumanMessage):
-                 formatted_messages.append({"role": "user", "content": msg.content})
-             elif isinstance(msg, AIMessage):
-                 formatted_messages.append({"role": "assistant", "content": msg.content})
-             elif isinstance(msg, SystemMessage):
-                 formatted_messages.append({"role": "system", "content": msg.content})
-         # Build the prompt in the Llama 3 chat format: the role name goes inside the
-         # header tags, the content follows the header, and each turn ends with <|eot_id|>.
-         # prompt = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"  # TAIDE llama2
-         prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>"  # TAIDE llama3
-         for msg in formatted_messages:
-             if msg['role'] == 'user':
-                 # prompt += f"{msg['content']} [/INST]"  # TAIDE llama2
-                 prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{msg['content']}<|eot_id|>"  # TAIDE llama3
-             elif msg['role'] == "assistant":
-                 # prompt += f"{msg['content']} </s><s>[INST]"  # TAIDE llama2
-                 prompt += f"<|start_header_id|>assistant<|end_header_id|>\n\n{msg['content']}<|eot_id|>"  # TAIDE llama3
-         # Cue the model to produce the assistant's reply.
-         prompt += "<|start_header_id|>assistant<|end_header_id|>\n\n"
-         # Send the prompt to the "ollama" container and use its stdout as the answer.
-         command = ["docker", "exec", "-it", "ollama", "ollama", "run", self.model_name, prompt]
-         result = subprocess.run(command, capture_output=True, text=True)
-         if result.returncode != 0:
-             raise Exception(f"Ollama command failed: {result.stderr}")
-
-         content = result.stdout.strip()
-         message = AIMessage(content=content)
-         generation = ChatGeneration(message=message)
-         return ChatResult(generations=[generation])
-
-     @property
-     def _llm_type(self) -> str:
-         return "ollama-chat-model"
-
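- # Example usage (a sketch; assumes a model tagged "taide-local-llama3" has been
- # pulled or created inside the "ollama" container):
- #   taide_llm = OllamaChatModel(model_name="taide-local-llama3")
- #   print(taide_llm.invoke("溫室氣體是什麼?").content)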
- # taide_llm = OllamaChatModel(model_name="taide-local-llama2")
- if __name__ == "__main__":
-     # Interactive benchmark: ask a question (e.g. 溫室氣體是什麼? / "What are greenhouse
-     # gases?") and time each backend's answer. Type "exit" to quit.
-     while True:
-         question = input("Question: ")
-         if question.lower() == "exit":
-             break
-         for function in [ollama_, hf, openai_]:
-             start = time.time()
-             llm = function()
-             answer = llm.invoke(question)
-             print(answer)
-             processing_time = time.time() - start
-             print(processing_time)