# chromadb_generate.py
import os
import shutil

import openai
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_openai import OpenAIEmbeddings
  7. # Load environment variables
  8. load_dotenv('../environment.env')
  9. # Set up OpenAI API
  10. openai_api_key = os.getenv("OPENAI_API_KEY")
  11. if not openai_api_key:
  12. raise ValueError("No OpenAI API key found in environment variables")
  13. openai.api_key = openai_api_key
  14. # Initialize embeddings model
  15. embeddings_model = OpenAIEmbeddings()
  16. def extract_field(doc, field_name):
  17. for line in doc.page_content.split('\n'):
  18. if line.startswith(f"{field_name}:"):
  19. return line.split(':', 1)[1].strip()
  20. return None
  21. # Check if Chroma DB already exists
  22. if not os.path.exists("./chroma_db"):
  23. try:
  24. # Load and process CSV data
  25. loader = CSVLoader(file_path="log_record_rows.csv")
  26. data = loader.load()
  27. field_name = "question"
  28. questions = [extract_field(doc, field_name) for doc in data]
  29. # Create and save Chroma vector store
  30. vectorstore = Chroma.from_texts(
  31. texts=questions,
  32. embedding=embeddings_model,
  33. persist_directory="./chroma_db"
  34. )
  35. print("Chroma database created successfully.")
  36. except Exception as e:
  37. print(f"An error occurred while creating the Chroma database: {e}")
  38. else:
  39. print("Chroma database already exists.")