소스 검색

add sql examples and adjust prompt

ling 4 달 전
부모
커밋
1348a079ba
1개의 변경된 파일, 90개의 추가 그리고 20개의 삭제
  1. 90 20
      text_to_sql_private.py

+ 90 - 20
text_to_sql_private.py

@@ -46,8 +46,10 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 ##########################################################################################
 ##########################################################################################
 from langchain_community.chat_models import ChatOllama
 from langchain_community.chat_models import ChatOllama
 # local_llm = "llama3-groq-tool-use:latest"
 # local_llm = "llama3-groq-tool-use:latest"
-local_llm = "llama3-groq-tool-use:latest"
-llm = ChatOllama(model=local_llm, temperature=0)
+# local_llm = "llama3-groq-tool-use:latest"
+# local_llm = "sqlcoder:latest"
+# local_llm = "llama3.1:8b-instruct-q2_K"
+# llm = ChatOllama(model=local_llm, temperature=0)
 ##########################################################################################
 ##########################################################################################
 # model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 # model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 # tokenizer = AutoTokenizer.from_pretrained(model_id)
 # tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -73,57 +75,110 @@ llm = ChatOllama(model=local_llm, temperature=0)
 # llm = HuggingFacePipeline(pipeline=pipe)
 # llm = HuggingFacePipeline(pipeline=pipe)
 
 
 # llm = Ollama(model = "llama3-groq-tool-use:latest", num_gpu=1)
 # llm = Ollama(model = "llama3-groq-tool-use:latest", num_gpu=1)
+from langchain_openai import ChatOpenAI
+llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
+
 def get_examples():
 def get_examples():
     examples = [
     examples = [
         {
         {
-            "input": "建準廣興廠去年的自產電力的綠電使用量是多少?",
+            "input": "建準去年固定燃燒總排放量",
+            "query": """SELECT SUM("排放量(公噸CO2e)") AS "固定燃燒總排放量"
+                        FROM "建準碳排放清冊數據new"
+                        WHERE "事業名稱" like '%建準%'
+                        AND ("類別項目" like '%固定燃燒%' OR "排放源" like '%固定燃燒%')
+                        AND "盤查標準" = 'GHG'
+                        AND "年度" = EXTRACT(YEAR FROM CURRENT_DATE)-1;""",
+        },
+        {
+            "input": "廣興廠去年的固定燃燒排放量是多少?",
+            "query": """FROM "建準碳排放清冊數據new"
+                        WHERE "事業名稱" like '%建準%'
+                        AND "據點" = '昆山廣興廠'
+                        AND ("類別項目" like '%固定燃燒%' OR "排放源" like '%固定燃燒%')
+                        AND "盤查標準" = 'GHG'
+                        AND "年度" = EXTRACT(YEAR FROM CURRENT_DATE)-1;""",
+        },
+        {
+            "input": "建準廣興廠去年自產電力的綠電使用量是多少?",
             "query": """SELECT SUM("用電度數(kwh)") AS "綠電使用量"
             "query": """SELECT SUM("用電度數(kwh)") AS "綠電使用量"
                         FROM "用電度數"
                         FROM "用電度數"
                         WHERE "項目" like '%綠電%'
                         WHERE "項目" like '%綠電%'
                         AND "事業名稱" like '%建準%'
                         AND "事業名稱" like '%建準%'
-                        AND "事業名稱" like '%廣興廠%'
+                        AND "據點" = '昆山廣興廠'
                         AND "盤查標準" = 'GHG'
                         AND "盤查標準" = 'GHG'
                         AND "年度" = EXTRACT(YEAR FROM CURRENT_DATE)-1;""",
                         AND "年度" = EXTRACT(YEAR FROM CURRENT_DATE)-1;""",
         },
         },
         {
         {
-            "input": "建準北海廠去年的類別1總排放量是多少?",
+            "input": "建準北海廠去年的類別1總排放量",
             "query": """SELECT SUM("排放量(公噸CO2e)") AS "類別1總排放量"
             "query": """SELECT SUM("排放量(公噸CO2e)") AS "類別1總排放量"
                         FROM "建準碳排放清冊數據new"
                         FROM "建準碳排放清冊數據new"
                         WHERE "事業名稱" like '%建準%'
                         WHERE "事業名稱" like '%建準%'
-                        AND "事業名稱" like '%北海%'
+                        AND "據點" in ('北海建準廠', '北海立準廠')
                         AND "類別" = '類別1'
                         AND "類別" = '類別1'
                         AND "盤查標準" = 'GHG'
                         AND "盤查標準" = 'GHG'
                         AND "年度" = EXTRACT(YEAR FROM CURRENT_DATE)-1;""",
                         AND "年度" = EXTRACT(YEAR FROM CURRENT_DATE)-1;""",
         },
         },
         {
         {
             "input": "建準廣興廠去年的直接排放總排放量是多少?",
             "input": "建準廣興廠去年的直接排放總排放量是多少?",
-            "query": """SELECT SUM("排放量(公噸CO2e)") AS "類別1總排放量"
+            "query": """SELECT SUM("排放量(公噸CO2e)") AS "直接排放總排放量"
                         FROM "建準碳排放清冊數據new"
                         FROM "建準碳排放清冊數據new"
                         WHERE "事業名稱" like '%建準%'
                         WHERE "事業名稱" like '%建準%'
-                        AND "事業名稱" like '%廣興%'
+                        AND "據點" = '昆山廣興廠'
                         AND ("類別項目" like '%直接排放%' OR "排放源" like '%直接排放%')
                         AND ("類別項目" like '%直接排放%' OR "排放源" like '%直接排放%')
                         AND "盤查標準" = 'GHG'
                         AND "盤查標準" = 'GHG'
                         AND "年度" = EXTRACT(YEAR FROM CURRENT_DATE)-1;""",
                         AND "年度" = EXTRACT(YEAR FROM CURRENT_DATE)-1;""",
         },
         },
         {
         {
             "input": "建準台北辦事處2022年的類別2總排放量是多少?",
             "input": "建準台北辦事處2022年的類別2總排放量是多少?",
-            "query": """SELECT SUM("排放量(公噸CO2e)") AS "直接排放總排放量"
+            "query": """SELECT SUM("排放量(公噸CO2e)") AS "類別2總排放量"
                         FROM "建準碳排放清冊數據new"
                         FROM "建準碳排放清冊數據new"
                         WHERE "事業名稱" like '%建準%'
                         WHERE "事業名稱" like '%建準%'
-                        AND "事業名稱" like '%台北%'
+                        AND "據點" = '台北辦事處'
                         AND "類別" = '類別2'
                         AND "類別" = '類別2'
                         AND "盤查標準" = 'GHG'
                         AND "盤查標準" = 'GHG'
                         AND "年度" = 2022;""",
                         AND "年度" = 2022;""",
         },
         },
         {
         {
-            "input": "建準去年的固定燃燒總排放量是多少?",
-            "query": """SELECT SUM("排放量(公噸CO2e)") AS "固定燃燒總排放量"
+            "input": "建準法國廠2022年的類別2總排放量",
+            "query": """SELECT SUM("排放量(公噸CO2e)") AS "類別2總排放量"
                         FROM "建準碳排放清冊數據new"
                         FROM "建準碳排放清冊數據new"
                         WHERE "事業名稱" like '%建準%'
                         WHERE "事業名稱" like '%建準%'
-                        AND ("類別項目" like '%固定燃燒%' OR "排放源" like '%固定燃燒%')
+                        AND "國家" = '法國'
+                        AND "類別" = '類別2'
                         AND "盤查標準" = 'GHG'
                         AND "盤查標準" = 'GHG'
-                        AND "年度" = EXTRACT(YEAR FROM CURRENT_DATE)-1;""",
+                        AND "年度" = 2022;""",
+        },
+        {
+            "input": "建準北海2022的外購電力是多少",
+            "query": """SELECT SUM("用電度數(kwh)") AS "外購電力"
+                        FROM "用電度數"
+                        WHERE "事業名稱" like '%建準%'
+                        AND "據點" in ('北海建準廠', '北海立準廠')
+                        AND "項目" like '%外購電力%'
+                        AND "盤查標準" = 'GHG'
+                        AND "年度" = 2022;""",
+        },
+        {
+            "input": "2023建準印度的其他間接排放是多少",
+            "query": """SELECT SUM("排放量(公噸CO2e)") AS "其他間接排放總量"
+                        FROM "建準碳排放清冊數據new"
+                        WHERE "事業名稱" like '%建準%'
+                        AND "國家" = '印度'
+                        AND ("類別項目" like '%其他間接排放%' OR "排放源" like '%其他間接排放%')
+                        AND "盤查標準" = 'GHG'
+                        AND "年度" = 2023;""",
         },
         },
+        {
+            "input": "建準台北前年的產品使用碳排放量是多少",
+            "query": """SELECT SUM("排放量(公噸CO2e)") AS "產品使用總量"
+                        FROM "建準碳排放清冊數據new"
+                        WHERE "事業名稱" like '%建準%'
+                        AND "據點" = '台北辦事處'
+                        AND ("類別項目" like '%產品使用%' OR "排放源" like '%產品使用%')
+                        AND "盤查標準" = 'GHG'
+                        AND "年度" = EXTRACT(YEAR FROM CURRENT_DATE)-2;""",
+        },
+
 
 
 
 
     ]
     ]
@@ -137,7 +192,8 @@ def table_description():
         "The `建準碳排放清冊數據new` table 描述了建準電機工業股份有限公司不同據點分別在 ISO 14064-1:2018 與 GHG Protocol 標準下的溫室氣體排放量,並依類別1至類別6劃分。"
         "The `建準碳排放清冊數據new` table 描述了建準電機工業股份有限公司不同據點分別在 ISO 14064-1:2018 與 GHG Protocol 標準下的溫室氣體排放量,並依類別1至類別6劃分。"
         "It includes the following columns:\n"
         "It includes the following columns:\n"
         "- `年度`: 盤查年度\n"
         "- `年度`: 盤查年度\n"
-        "- `事業名稱`: 建準據點"
+        "- `事業名稱`: 公司名稱"
+        "- `據點`: 建準廠房據點 include '高雄總部及運通廠', '台北辦事處', '昆山廣興廠', '北海建準廠', '北海立準廠', '菲律賓建準廠', 'Inc', 'SAS', 'India'"
         "- `國家`: 據點所在國家"
         "- `國家`: 據點所在國家"
         "- `類別`: 溫室氣體的排放類別,包含以下選項:\n"
         "- `類別`: 溫室氣體的排放類別,包含以下選項:\n"
         "   \t*類別1-直接排放:\n"
         "   \t*類別1-直接排放:\n"
@@ -183,7 +239,9 @@ def write_query_chain(db, llm):
     <|begin_of_text|>
     <|begin_of_text|>
     
     
     <|start_header_id|>system<|end_header_id|>
     <|start_header_id|>system<|end_header_id|>
+
     Generate a SQL query to answer this question: `{input}`
     Generate a SQL query to answer this question: `{input}`
+    你是建準的AI助理,幫助建準查詢碳排放量,如果問題中有提到據點廠房,請使用 PostgreSQL query 進行篩選。
 
 
     You are a PostgreSQL expert in ESG field. Given an input question, first create a syntactically correct PostgreSQL query to run, 
     You are a PostgreSQL expert in ESG field. Given an input question, first create a syntactically correct PostgreSQL query to run, 
     then look at the results of the query and return the answer to the input question.\n\
     then look at the results of the query and return the answer to the input question.\n\
@@ -192,6 +250,7 @@ def write_query_chain(db, llm):
     Never query for all columns from a table. You must query only the columns that are needed to answer the question. 
     Never query for all columns from a table. You must query only the columns that are needed to answer the question. 
     Wrap each column name in  Quotation Mark (") to denote them as delimited identifiers.\n\
     Wrap each column name in  Quotation Mark (") to denote them as delimited identifiers.\n\
     
     
+    Unless the user ask for the type of 盤查標準 to be 'ISO' or 'GHG', queries always include query "盤查標準"='GHG' in the WHERE clause.\n  
     ***Pay attention to only return query for PostgreSQL WITHOUT "```sql", And DO NOT content any other words.\n\
     ***Pay attention to only return query for PostgreSQL WITHOUT "```sql", And DO NOT content any other words.\n\
     ***Pay attention to only return PostgreSQL query and no premable or explanation.\n\
     ***Pay attention to only return PostgreSQL query and no premable or explanation.\n\
     <|eot_id|>
     <|eot_id|>
@@ -209,9 +268,9 @@ def write_query_chain(db, llm):
     Below are a number of examples of questions and their corresponding SQL queries.\n\
     Below are a number of examples of questions and their corresponding SQL queries.\n\
     
     
     <|eot_id|>
     <|eot_id|>
-    
-    <|start_header_id|>assistant<|end_header_id|>
+    SQL query:
     """
     """
+    # <|start_header_id|>assistant<|end_header_id|>
     # prompt_template = PromptTemplate.from_template(template)
     # prompt_template = PromptTemplate.from_template(template)
 
 
     example_prompt = PromptTemplate.from_template("The following SQL query best answers the question `{input}`\nSQL query: {query}")
     example_prompt = PromptTemplate.from_template("The following SQL query best answers the question `{input}`\nSQL query: {query}")
@@ -227,6 +286,7 @@ def write_query_chain(db, llm):
     # llm = HuggingFacePipeline(pipeline=pipe)
     # llm = HuggingFacePipeline(pipeline=pipe)
     
     
     
     
+    # sqlcoder = Ollama(model = "sqlcoder", num_gpu=1)
     write_query = create_sql_query_chain(llm, db, prompt)
     write_query = create_sql_query_chain(llm, db, prompt)
 
 
 
 
@@ -245,11 +305,11 @@ def sql_to_nl_chain(llm):
         ** 請務必在回答中表達是建準的資料,即便問句中並未提及建準。
         ** 請務必在回答中表達是建準的資料,即便問句中並未提及建準。
         
         
         The following shows some example:
         The following shows some example:
-        Question: 廣興廠去年的類別1總排放量是多少?
+        Question: 建準廣興廠去年的類別1總排放量是多少?
         SQL Query: SELECT SUM("排放量(公噸CO2e)") AS "類別1總排放量"
         SQL Query: SELECT SUM("排放量(公噸CO2e)") AS "類別1總排放量"
                         FROM "建準碳排放清冊數據new"
                         FROM "建準碳排放清冊數據new"
                         WHERE "事業名稱" like '%建準%'
                         WHERE "事業名稱" like '%建準%'
-                        AND "事業名稱" like '%廣興%'
+                        AND "據點" = '昆山廣興廠'
                         AND "類別" = '類別1'
                         AND "類別" = '類別1'
                         AND "盤查標準" = 'GHG'
                         AND "盤查標準" = 'GHG'
                         AND "年度" = EXTRACT(YEAR FROM CURRENT_DATE)-1;,
                         AND "年度" = EXTRACT(YEAR FROM CURRENT_DATE)-1;,
@@ -257,6 +317,7 @@ def sql_to_nl_chain(llm):
         Answer: 建準廣興廠去年的類別1總排放量是1102.3712
         Answer: 建準廣興廠去年的類別1總排放量是1102.3712
 
 
         如果你不知道答案或SQL query 出現錯誤請回答:"很抱歉,目前我無法回答您的問題,請將您的詢問發送至 test@systex.com 以便獲得更進一步的幫助,謝謝。"
         如果你不知道答案或SQL query 出現錯誤請回答:"很抱歉,目前我無法回答您的問題,請將您的詢問發送至 test@systex.com 以便獲得更進一步的幫助,謝謝。"
+        
         勿回答無關資訊
         勿回答無關資訊
         <|eot_id|>
         <|eot_id|>
 
 
@@ -278,10 +339,14 @@ def sql_to_nl_chain(llm):
     return chain
     return chain
 
 
 def get_query(db, question, selected_table, llm):
 def get_query(db, question, selected_table, llm):
+    
     write_query = write_query_chain(db, llm)
     write_query = write_query_chain(db, llm)
     query = write_query.invoke({"question": question, 'table_names_to_use': selected_table, "top_k": 1000, "table_info":context["table_info"], "database_description": table_description()})
     query = write_query.invoke({"question": question, 'table_names_to_use': selected_table, "top_k": 1000, "table_info":context["table_info"], "database_description": table_description()})
     
     
     query = re.split('SQL query: ', query)[-1]
     query = re.split('SQL query: ', query)[-1]
+    query = query.replace("```sql","").replace("```","")
+    query = query.replace("碰排","碳排")
+    query = query.replace("%%","%")
     # query = query.replace("104_112碰排放公開及建準資料","104_112碳排放公開及建準資料")
     # query = query.replace("104_112碰排放公開及建準資料","104_112碳排放公開及建準資料")
     print(query)
     print(query)
     
     
@@ -308,6 +373,9 @@ def run(db, question, selected_table, llm):
     query = write_query.invoke({"question": question, 'table_names_to_use': selected_table, "top_k": 1000, "table_info":context["table_info"], "database_description": table_description()})
     query = write_query.invoke({"question": question, 'table_names_to_use': selected_table, "top_k": 1000, "table_info":context["table_info"], "database_description": table_description()})
     
     
     query = re.split('SQL query: ', query)[-1]
     query = re.split('SQL query: ', query)[-1]
+    query = query.replace("```sql","").replace("```","")
+    query = query.replace("碰排","碳排")
+    query = query.replace("%%","%")
     # query = query.replace("104_112碰排放公開及建準資料","104_112碳排放公開及建準資料")
     # query = query.replace("104_112碰排放公開及建準資料","104_112碳排放公開及建準資料")
     print(query)
     print(query)
 
 
@@ -327,7 +395,9 @@ if __name__ == "__main__":
     start = time.time()
     start = time.time()
     
     
     selected_table = ['用水度數', '用水度數', '建準碳排放清冊數據new']
     selected_table = ['用水度數', '用水度數', '建準碳排放清冊數據new']
-    question = "建準去年的上游運輸總排放量是多少?"
+    # question = "建準廣興廠去年的上游運輸總排放量是多少?"
+    question = "建準北海廠去年的固定燃燒排放量是多少?"
+    # question = "建準北海廠去年類別1總排放量是多少?"
     # question = "台積電2022年的直接排放總排放量是多少?"
     # question = "台積電2022年的直接排放總排放量是多少?"
     # question = "建準廣興廠去年的灰電使用量"
     # question = "建準廣興廠去年的灰電使用量"
     query, result, answer = run(db, question, selected_table, llm)
     query, result, answer = run(db, question, selected_table, llm)