# text_to_sql_private.py
  1. import re
  2. from dotenv import load_dotenv
  3. load_dotenv()
  4. from langchain_community.utilities import SQLDatabase
  5. import os
# Database connection string, read from the SUPABASE_URI environment variable
# (load_dotenv() has already been called above).
# NOTE(review): os.environ.get returns None when the variable is unset, so the
# `str` annotation is optimistic; confirm the variable is always provided.
URI: str = os.environ.get('SUPABASE_URI')
# Module-level database handle; the script entry point passes it to run().
db = SQLDatabase.from_uri(URI)
# Disabled debugging helpers:
# print(db.dialect)
# print(db.get_usable_table_names())
# db.run('SELECT * FROM "2022 清冊數據(GHG)" LIMIT 10;')
# Database context captured once at import time; get_query() and run() read
# context["table_info"] from it.
context = db.get_context()
  12. # print(list(context))
  13. # print(context["table_info"])
  14. from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate
  15. from langchain.chains import create_sql_query_chain
  16. from langchain_community.llms import Ollama
  17. from langchain_community.tools.sql_database.tool import QuerySQLDataBaseTool
  18. from operator import itemgetter
  19. from langchain_core.output_parsers import StrOutputParser
  20. from langchain_core.prompts import PromptTemplate
  21. from langchain_core.runnables import RunnablePassthrough
  22. # Load model directly
  23. from transformers import AutoTokenizer, AutoModelForCausalLM
  24. from transformers import AutoModelForCausalLM, AutoTokenizer,pipeline
  25. import torch
  26. from langchain_huggingface import HuggingFacePipeline
  27. # Load model directly
  28. from transformers import AutoTokenizer, AutoModelForCausalLM
  29. # model_id = "defog/llama-3-sqlcoder-8b"
  30. # tokenizer = AutoTokenizer.from_pretrained(model_id)
  31. # sql_llm = HuggingFacePipeline.from_model_id(
  32. # model_id=model_id,
  33. # task="text-generation",
  34. # model_kwargs={"torch_dtype": torch.bfloat16},
  35. # pipeline_kwargs={"return_full_text": False},
  36. # device=0, device_map='cuda')
  37. ##########################################################################################
  38. from langchain_community.chat_models import ChatOllama
  39. # local_llm = "llama3-groq-tool-use:latest"
  40. # local_llm = "llama3-groq-tool-use:latest"
  41. # local_llm = "sqlcoder:latest"
  42. # local_llm = "llama3.1:8b-instruct-q2_K"
  43. # llm = ChatOllama(model=local_llm, temperature=0)
  44. ##########################################################################################
  45. # model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
  46. # tokenizer = AutoTokenizer.from_pretrained(model_id)
  47. # llm = HuggingFacePipeline.from_model_id(
  48. # model_id=model_id,
  49. # task="text-generation",
  50. # model_kwargs={"torch_dtype": torch.bfloat16},
  51. # pipeline_kwargs={"return_full_text": False,
  52. # "max_new_tokens": 512},
  53. # device=0, device_map='cuda')
  54. # print(llm.pipeline)
  55. # llm.pipeline.tokenizer.pad_token_id = llm.pipeline.model.config.eos_token_id[0]
  56. ##########################################################################################
  57. # model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True)
  58. # pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=500, top_k=50, temperature=0.1,
  59. # model_kwargs={"torch_dtype": torch.bfloat16, "return_full_text": False})
  60. #, device="auto", load_in_4bit=True
  61. # llm = HuggingFacePipeline(pipeline=pipe)
  62. # llm = HuggingFacePipeline(pipeline=pipe)
  63. # llm = Ollama(model = "llama3-groq-tool-use:latest", num_gpu=1)
  64. from langchain_openai import ChatOpenAI
# Module-level chat model; the script entry point passes it to run(), which
# uses it for both SQL generation and the natural-language answer.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
  66. def get_examples():
  67. examples = [
  68. {
  69. "input": "建準去年固定燃燒總排放量",
  70. "query": """SELECT SUM("排放量(公噸CO2e)") AS "固定燃燒總排放量"
  71. FROM "建準碳排放清冊數據new"
  72. WHERE "事業名稱" like '%建準%'
  73. AND ("類別項目" like '%固定燃燒%' OR "排放源" like '%固定燃燒%')
  74. AND "盤查標準" = 'GHG'
  75. AND "年度" = EXTRACT(YEAR FROM CURRENT_DATE)-1;""",
  76. },
  77. {
  78. "input": "廣興廠去年的固定燃燒排放量是多少?",
  79. "query": """FROM "建準碳排放清冊數據new"
  80. WHERE "事業名稱" like '%建準%'
  81. AND "據點" = '昆山廣興廠'
  82. AND ("類別項目" like '%固定燃燒%' OR "排放源" like '%固定燃燒%')
  83. AND "盤查標準" = 'GHG'
  84. AND "年度" = EXTRACT(YEAR FROM CURRENT_DATE)-1;""",
  85. },
  86. {
  87. "input": "建準廣興廠去年自產電力的綠電使用量是多少?",
  88. "query": """SELECT SUM("用電度數(kwh)") AS "綠電使用量"
  89. FROM "用電度數"
  90. WHERE "項目" like '%綠電%'
  91. AND "事業名稱" like '%建準%'
  92. AND "據點" = '昆山廣興廠'
  93. AND "盤查標準" = 'GHG'
  94. AND "年度" = EXTRACT(YEAR FROM CURRENT_DATE)-1;""",
  95. },
  96. {
  97. "input": "廣興廠2023綠電使用量",
  98. "query": """SELECT SUM("用電度數(kwh)") AS "綠電使用量"
  99. FROM "用電度數"
  100. WHERE "項目" like '%綠電%'
  101. AND "事業名稱" like '%建準%'
  102. AND "據點" = '昆山廣興廠'
  103. AND "盤查標準" = 'GHG'
  104. AND "年度" = 2023;""",
  105. },
  106. {
  107. "input": "北海廠去年的類別1總排放量",
  108. "query": """SELECT SUM("排放量(公噸CO2e)") AS "類別1總排放量"
  109. FROM "建準碳排放清冊數據new"
  110. WHERE "事業名稱" like '%建準%'
  111. AND "據點" in ('北海建準廠', '北海立準廠')
  112. AND "類別" = '類別1'
  113. AND "盤查標準" = 'GHG'
  114. AND "年度" = EXTRACT(YEAR FROM CURRENT_DATE)-1;""",
  115. },
  116. {
  117. "input": "廣興廠去年的直接排放總排放量是多少?",
  118. "query": """SELECT SUM("排放量(公噸CO2e)") AS "直接排放總排放量"
  119. FROM "建準碳排放清冊數據new"
  120. WHERE "事業名稱" like '%建準%'
  121. AND "據點" = '昆山廣興廠'
  122. AND ("類別項目" like '%直接排放%' OR "排放源" like '%直接排放%')
  123. AND "盤查標準" = 'GHG'
  124. AND "年度" = EXTRACT(YEAR FROM CURRENT_DATE)-1;""",
  125. },
  126. {
  127. "input": "建準台北辦事處2022年的類別2總排放量是多少?",
  128. "query": """SELECT SUM("排放量(公噸CO2e)") AS "類別2總排放量"
  129. FROM "建準碳排放清冊數據new"
  130. WHERE "事業名稱" like '%建準%'
  131. AND "據點" = '台北辦事處'
  132. AND "類別" = '類別2'
  133. AND "盤查標準" = 'GHG'
  134. AND "年度" = 2022;""",
  135. },
  136. {
  137. "input": "建準法國廠2022年的類別2總排放量",
  138. "query": """SELECT SUM("排放量(公噸CO2e)") AS "類別2總排放量"
  139. FROM "建準碳排放清冊數據new"
  140. WHERE "事業名稱" like '%建準%'
  141. AND "國家" = '法國'
  142. AND "類別" = '類別2'
  143. AND "盤查標準" = 'GHG'
  144. AND "年度" = 2022;""",
  145. },
  146. {
  147. "input": "建準北海2022的外購電力是多少",
  148. "query": """SELECT SUM("用電度數(kwh)") AS "外購電力"
  149. FROM "用電度數"
  150. WHERE "事業名稱" like '%建準%'
  151. AND "據點" in ('北海建準廠', '北海立準廠')
  152. AND "項目" like '%外購電力%'
  153. AND "盤查標準" = 'GHG'
  154. AND "年度" = 2022;""",
  155. },
  156. {
  157. "input": "2023建準印度的其他間接排放是多少",
  158. "query": """SELECT SUM("排放量(公噸CO2e)") AS "其他間接排放總量"
  159. FROM "建準碳排放清冊數據new"
  160. WHERE "事業名稱" like '%建準%'
  161. AND "國家" = '印度'
  162. AND ("類別項目" like '%其他間接排放%' OR "排放源" like '%其他間接排放%')
  163. AND "盤查標準" = 'GHG'
  164. AND "年度" = 2023;""",
  165. },
  166. {
  167. "input": "建準台北前年的產品使用碳排放量是多少",
  168. "query": """SELECT SUM("排放量(公噸CO2e)") AS "產品使用總量"
  169. FROM "建準碳排放清冊數據new"
  170. WHERE "事業名稱" like '%建準%'
  171. AND "據點" = '台北辦事處'
  172. AND ("類別項目" like '%產品使用%' OR "排放源" like '%產品使用%')
  173. AND "盤查標準" = 'GHG'
  174. AND "年度" = EXTRACT(YEAR FROM CURRENT_DATE)-2;""",
  175. },
  176. {
  177. "input": "建準去年範疇三排放量",
  178. "query": """SELECT SUM("排放量(公噸CO2e)") AS "範疇三排放量"
  179. FROM "建準碳排放清冊數據new"
  180. WHERE "事業名稱" like '%建準%'
  181. AND "範疇" = '範疇三'
  182. AND "盤查標準" = 'GHG'
  183. AND "年度" = EXTRACT(YEAR FROM CURRENT_DATE)-1;""",
  184. },
  185. ]
  186. return examples
  187. def table_description():
  188. database_description = (
  189. "The database consists of following table: `用水度數`, `用水度數`, `建準碳排放清冊數據new`."
  190. "This is a PostgreSQL database, so you need to use postgres-related queries.\n\n"
  191. "The `建準碳排放清冊數據new` table 描述了建準電機工業股份有限公司不同據點分別在 ISO 14064-1:2018 與 GHG Protocol 標準下的溫室氣體排放量,並依類別1至類別6劃分。"
  192. "It includes the following columns:\n"
  193. "- `年度`: 盤查年度\n"
  194. "- `事業名稱`: 公司名稱"
  195. "- `據點`: 建準廠房據點 include '高雄總部及運通廠', '台北辦事處', '昆山廣興廠', '北海建準廠', '北海立準廠', '菲律賓建準廠', 'Inc', 'SAS', 'India'"
  196. "- `國家`: 據點所在國家"
  197. "- `範疇`: 碳盤查中把溫室氣體排放源分成三大範疇"
  198. "- `類別`: 溫室氣體的排放類別,包含以下選項:\n"
  199. " \t*類別1-直接排放:\n"
  200. " \t*類別2-能源間接排放\n"
  201. " \t*類別3-運輸間接排放\n"
  202. " \t*類別4-組織使用產品間接排放\n"
  203. " \t*類別5-使用來自組織產品間接排放\n"
  204. " \t*類別6\n"
  205. "- `排放源`: 由`類別`欄位進一步劃分的細項,包含以下選項:`固定燃燒`, `移動燃燒`, `製程排放`, `逸散排放`, `土地利用`, "
  206. "`外購電力`, `外購能源`, `上游運輸`, `下游運輸`, `員工通勤`, `商務旅行`, `訪客運輸`, "
  207. "`購買產品`, `外購燃料及能資源`, `資本貨物`, `上游租賃`, `廢棄物處理`, `廢棄物清運`, `其他委外業務`, "
  208. "`產品加工`, `產品使用`, `產品最終處理`, `下游租賃`, `投資排放`, `其他`, `其他間接排放` \n"
  209. "- `排放量(公噸CO2e)`: 溫室氣體排放量\n"
  210. "- `盤查標準`: ISO or GHG\n"
  211. "The `用電度數` 描述了建準電機工業股份有限公司不同據點分別在 ISO 14064-1:2018 與 GHG Protocol 標準下的水電使用量。"
  212. "It includes the following columns:\n"
  213. "- `年度`: 盤查年度\n"
  214. "- `事業名稱`: 建準據點"
  215. "- `國家`: 據點所在國家"
  216. "- `項目`: 用電項目,包含以下:\n"
  217. " \t*外購電力(灰電): 灰電(火力發電、核能發電等)的外購電力度數(kwh)\n"
  218. " \t*外購電力(綠電): 綠電(太陽光電)的外購電力度數(kwh)\n"
  219. " \t*自產電力(綠電): 綠電(太陽光電)的自產電力度數(kwh)\n"
  220. "- `用電度數(kwh)`: 用電度數,單位為kwh\n"
  221. "- `盤查標準`: ISO or GHG\n"
  222. "The `用水度數` 描述了建準電機工業股份有限公司不同據點分別在 ISO 14064-1:2018 與 GHG Protocol 標準下的水電使用量。"
  223. "It includes the following columns:\n"
  224. "- `年度`: 盤查年度\n"
  225. "- `事業名稱`: 建準據點"
  226. "- `國家`: 據點所在國家"
  227. "- `自來水度數(立方公尺 m³)`: 用水度數,單位為m³\n"
  228. "- `盤查標準`: ISO or GHG\n"
  229. )
  230. return database_description
  231. def write_query_chain(db, llm):
  232. template = """
  233. <|begin_of_text|>
  234. <|start_header_id|>system<|end_header_id|>
  235. Generate a SQL query to answer this question: `{input}`
  236. 你是建準的AI助理,幫助建準查詢碳排放量,如果問題中有提到據點廠房,請使用 PostgreSQL query 進行篩選。
  237. You are a PostgreSQL expert in ESG field. Given an input question, first create a syntactically correct PostgreSQL query to run,
  238. then look at the results of the query and return the answer to the input question.\n\
  239. Unless the user specifies in the question a specific number of examples to obtain, query for at most {top_k} results using the LIMIT clause as per PostgreSQL.
  240. You can order the results to return the most informative data in the database.\n\
  241. Never query for all columns from a table. You must query only the columns that are needed to answer the question.
  242. Wrap each column name in Quotation Mark (") to denote them as delimited identifiers.\n\
  243. Unless the user ask for the type of 盤查標準 to be 'ISO' or 'GHG', queries always include query "盤查標準"='GHG' in the WHERE clause.\n
  244. ***Pay attention to only return query for PostgreSQL WITHOUT "```sql", And DO NOT content any other words.\n\
  245. ***Pay attention to only return PostgreSQL query and no premable or explanation.\n\
  246. <|eot_id|>
  247. <|begin_of_text|><|start_header_id|>user<|end_header_id|>
  248. DDL statements:
  249. {table_info}
  250. The following is a description of database. Please refer to the database description to give the correct WHERE statement in the PostgreSQL query.\
  251. In particular, the details of the `排放源` and `類別` columns.\n
  252. database description:
  253. {database_description}
  254. Provide ONLY PostgreSQL query and NO premable or explanation!
  255. Below are a number of examples of questions and their corresponding SQL queries.\n\
  256. <|eot_id|>
  257. SQL query:
  258. """
  259. # <|start_header_id|>assistant<|end_header_id|>
  260. # prompt_template = PromptTemplate.from_template(template)
  261. example_prompt = PromptTemplate.from_template("The following SQL query best answers the question `{input}`\nSQL query: {query}")
  262. prompt = FewShotPromptTemplate(
  263. examples=get_examples(),
  264. example_prompt=example_prompt,
  265. prefix=template,
  266. suffix="User input: {input}\nSQL query: ",
  267. input_variables=["input", "top_k", "table_info"],
  268. )
  269. # llm = Ollama(model = "sqlcoder", num_gpu=1)
  270. # llm = HuggingFacePipeline(pipeline=pipe)
  271. # sqlcoder = Ollama(model = "sqlcoder", num_gpu=1)
  272. write_query = create_sql_query_chain(llm, db, prompt)
  273. return write_query
  274. def sql_to_nl_chain(llm):
  275. # llm = Ollama(model = "llama3.1", num_gpu=1)
  276. # llm = Ollama(model = "llama3.1:8b-instruct-q2_K", num_gpu=1)
  277. # llm = Ollama(model = "llama3-groq-tool-use:latest", num_gpu=1)
  278. answer_prompt = PromptTemplate.from_template(
  279. """
  280. <|begin_of_text|>
  281. <|begin_of_text|><|start_header_id|>system<|end_header_id|>
  282. Given the following user question, corresponding SQL query, and SQL result, answer the user question.
  283. 根據使用者的問題、對應的 SQL 查詢和 SQL 結果,以繁體中文回答使用者問題。
  284. ** 請務必在回答中表達是建準的資料,即便問句中並未提及建準。
  285. 如果有單位,請回答時使用單位。
  286. The following shows some example:
  287. Question: 建準廣興廠去年的類別1總排放量是多少?
  288. SQL Query: SELECT SUM("排放量(公噸CO2e)") AS "類別1總排放量"
  289. FROM "建準碳排放清冊數據new"
  290. WHERE "事業名稱" like '%建準%'
  291. AND "據點" = '昆山廣興廠'
  292. AND "類別" = '類別1'
  293. AND "盤查標準" = 'GHG'
  294. AND "年度" = EXTRACT(YEAR FROM CURRENT_DATE)-1;,
  295. SQL Result: [(1102.3712,)]
  296. Answer: 建準廣興廠去年的類別1總排放量是1102.3712公噸CO2e
  297. 如果你不知道答案或SQL query 出現Error請回答:"很抱歉,目前我無法回答您的問題,請將您的詢問發送至 test@systex.com 以便獲得更進一步的幫助,謝謝。"
  298. 若 SQL Result 為 0 代表數據為0。
  299. 勿回答無關資訊
  300. <|eot_id|>
  301. <|begin_of_text|><|start_header_id|>user<|end_header_id|>
  302. Question: {question}
  303. SQL Query: {query}
  304. SQL Result: {result}
  305. Answer:
  306. <|eot_id|>
  307. <|start_header_id|>assistant<|end_header_id|>
  308. """
  309. )
  310. # llm = Ollama(model = "llama3-groq-tool-use:latest", num_gpu=1)
  311. chain = answer_prompt | llm | StrOutputParser()
  312. return chain
  313. def get_query(db, question, selected_table, llm):
  314. write_query = write_query_chain(db, llm)
  315. query = write_query.invoke({"question": question, 'table_names_to_use': selected_table, "top_k": 1000, "table_info":context["table_info"], "database_description": table_description()})
  316. # Regular expression pattern to extract SQL query
  317. sql_pattern = r'SELECT[\s\S]+?;'
  318. # Extract SQL query using re.search
  319. sql_query = re.search(sql_pattern, query)
  320. if sql_query:
  321. query = sql_query.group()
  322. # print(sql_query.group())
  323. else:
  324. print("No SQL query found.")
  325. query = re.split('SQL query: ', query)[-1]
  326. query = query.replace("```sql","").replace("```","")
  327. query = query.replace("碰排","碳排")
  328. query = query.replace("%%","%")
  329. # query = query.replace("104_112碰排放公開及建準資料","104_112碳排放公開及建準資料")
  330. print(query)
  331. execute_query = QuerySQLDataBaseTool(db=db)
  332. result = execute_query.invoke(query)
  333. print(result)
  334. return query, result
  335. def query_to_nl(question, query, result, llm):
  336. # execute_query = QuerySQLDataBaseTool(db=db)
  337. # result = execute_query.invoke(query)
  338. # print(result)
  339. chain = sql_to_nl_chain(llm)
  340. print(result)
  341. answer = chain.invoke({"question": question, "query": query, "result": result})
  342. return answer
  343. def run(db, question, selected_table, llm):
  344. write_query = write_query_chain(db, llm)
  345. query = write_query.invoke({"question": question, 'table_names_to_use': selected_table, "top_k": 1000, "table_info":context["table_info"], "database_description": table_description()})
  346. query = re.split('SQL query: ', query)[-1]
  347. query = query.replace("```sql","").replace("```","")
  348. query = query.replace("碰排","碳排")
  349. query = query.replace("%%","%")
  350. # query = query.replace("104_112碰排放公開及建準資料","104_112碳排放公開及建準資料")
  351. print(query)
  352. execute_query = QuerySQLDataBaseTool(db=db)
  353. result = execute_query.invoke(query)
  354. print(result)
  355. chain = sql_to_nl_chain(llm)
  356. answer = chain.invoke({"question": question, "query": query, "result": result})
  357. return query, result, answer
  358. if __name__ == "__main__":
  359. import time
  360. start = time.time()
  361. selected_table = ['用水度數', '用水度數', '建準碳排放清冊數據new']
  362. # question = "建準廣興廠去年的上游運輸總排放量是多少?"
  363. question = "建準北海廠去年的固定燃燒排放量是多少?"
  364. # question = "建準北海廠去年類別1總排放量是多少?"
  365. # question = "台積電2022年的直接排放總排放量是多少?"
  366. # question = "建準廣興廠去年的灰電使用量"
  367. query, result, answer = run(db, question, selected_table, llm)
  368. print("question: ", question)
  369. print("query: ", query)
  370. print("result: ", result)
  371. print("answer: ", answer)
  372. print(time.time()-start)