123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346 |
- # -*- coding: utf-8 -*-
- from selenium.webdriver.common.by import By
- from selenium.webdriver.common.keys import Keys
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from langchain_openai import ChatOpenAI
- from langchain_core.output_parsers import StrOutputParser
- from langchain_community.callbacks import get_openai_callback
- from langchain_core.prompts import ChatPromptTemplate
- import time
- import undetected_chromedriver as uc
- from dotenv import load_dotenv
- import os
- import logging
- # Load environment variables
- load_dotenv()
- # Record how the program runs (logging at INFO level)
- logging.basicConfig(level=logging.INFO)
class ArticleGenerator:
    """Build a Markdown article from web pages via NotebookLM and OpenAI.

    The flow drives the NotebookLM web UI in Chrome (through
    undetected_chromedriver), adds the given URLs as sources, asks for an
    article, passes the answer through an OpenAI chat model to get clean
    Markdown, and writes the result to a ``.md`` file.
    """

    NOTEBOOKLM_URL = "https://notebooklm.google.com/"
    # How many times browser start-up and the whole generation flow are tried.
    MAX_ATTEMPTS = 3
    # Upper bound, in seconds, for every explicit element wait.
    ELEMENT_TIMEOUT = 20
    # Fixed pause, in seconds, given to NotebookLM to write its answer.
    ANSWER_DELAY = 40
    # Position of the <chat-message> read as the answer.
    # NOTE(review): presumably index 0 is the user's own prompt - confirm.
    ANSWER_INDEX = 1

    def __init__(self, user_data_dir, profile_directory, version_main=132):
        """Store the Chrome profile settings; no browser is started here.

        Args:
            user_data_dir: Chrome "User Data" directory to reuse.
            profile_directory: Profile folder name inside ``user_data_dir``.
            version_main: Chrome major version handed to
                undetected_chromedriver (defaults to the previously
                hard-coded 132).
        """
        self.user_data_dir = user_data_dir
        self.profile_directory = profile_directory
        self.version_main = version_main
        self.driver = None
        # None when the variable is unset.
        # NOTE(review): this reads OPEN_API_KEY, not the conventional
        # OPENAI_API_KEY - confirm the .env file uses this name.
        self.OPEN_API_KEY = os.getenv('OPEN_API_KEY')

    def get_webdriver(self):
        """Start Chrome with the configured profile, retrying on failure.

        Returns:
            The started driver, which is also stored on ``self.driver``.

        Raises:
            Exception: whatever the last start-up attempt raised, once
                ``MAX_ATTEMPTS`` attempts have failed.
        """
        for attempt in range(self.MAX_ATTEMPTS):
            driver = None
            try:
                options = uc.ChromeOptions()
                options.add_argument('--ignore-certificate-errors')
                options.add_argument("--disable-gpu")
                options.add_argument("--disable-dev-shm-usage")
                options.add_argument(f"user-data-dir={self.user_data_dir}")
                options.add_argument(f'--profile-directory={self.profile_directory}')
                driver = uc.Chrome(
                    options=options,
                    version_main=self.version_main,
                    use_subprocess=True,
                )
                driver.execute_script(
                    "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
                )
                self.driver = driver
                return driver
            except Exception as e:
                print(f'Error: {e}')
                print(f"WebDriver 啟動失敗,第 {attempt + 1} 次嘗試...")
                # A browser that started but failed afterwards would keep the
                # profile directory in use and make the retry fail as well.
                if driver is not None:
                    try:
                        driver.quit()
                    except Exception as quit_error:
                        print(f'Error: {quit_error}')
                if attempt == self.MAX_ATTEMPTS - 1:
                    raise
                time.sleep(2)  # short pause before the next attempt

    def ttm(self, article, language='繁體中文'):
        """Ask the OpenAI chat model to rewrite ``article`` as Markdown.

        Args:
            article: Raw text to tidy up.
            language: Output language named in the system prompt.

        Returns:
            The model's reply as a string. Token usage and cost are printed.
        """
        # ``language`` is a template variable rather than being interpolated
        # with an f-string, so braces inside it cannot be mistaken for
        # template placeholders.
        qa_system_prompt = (
            "你是一個專門做 md 格式文章的AI助理.\n"
            "可以從一大串文字整理出高質量的 md 格式文章. 多餘的內容拿掉,只需要回傳文章內容即可.\n"
            "輸出的語言為{language}."
        )
        qa_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", qa_system_prompt),
                ("human", "{question}"),
            ]
        )
        llm = ChatOpenAI(
            model_name="gpt-4o",
            temperature=0.7,
            api_key=self.OPEN_API_KEY,
            max_tokens=4096,
        )
        chain = qa_prompt | llm | StrOutputParser()
        with get_openai_callback() as cb:
            text = chain.invoke(
                {"question": article + "請轉成 md 格式", "language": language}
            )
            print(f"Total Tokens: {cb.total_tokens}")
            print(f"Prompt Tokens: {cb.prompt_tokens}")
            print(f"Completion Tokens: {cb.completion_tokens}")
            print(f"Total Cost (USD): ${cb.total_cost}")
        return text

    def generate_article(self, urls):
        """Run the whole flow: NotebookLM answer -> OpenAI clean-up -> file.

        The flow is attempted up to ``MAX_ATTEMPTS`` times, and the browser is
        closed after every attempt, successful or not.

        Args:
            urls: Web page URLs to add to a new notebook as sources.

        Returns:
            ``'article.md 生成成功'`` when the article was saved,
            ``'存檔失敗'`` when saving failed (note: this string is truthy),
            or ``False`` when ``urls`` is empty or every attempt failed.
        """
        if not urls:
            # Without sources there is nothing to ask NotebookLM about.
            print('沒有提供任何網址,無法生成文章')
            return False

        for _attempt in range(self.MAX_ATTEMPTS):
            try:
                driver = self.get_webdriver()
                answer = self._collect_answer(driver, urls)
                text = self.ttm(answer)
                if self.save_article_as_md(text):
                    return 'article.md 生成成功'
                return '存檔失敗'
            except Exception as e:
                print("出現錯誤: ", str(e))
            finally:
                # Runs on success too, so Chrome is never left behind.
                self._quit_driver()
        return False

    def save_article_as_md(self, content, filename="./article.md"):
        """Write ``content`` to ``filename`` (UTF-8 with BOM).

        Returns:
            True when the file was written, False when writing failed.
        """
        try:
            with open(filename, 'w', encoding='utf-8-sig') as file:
                file.write(content)
        except (OSError, TypeError, ValueError) as e:
            # OSError: bad path or permissions; TypeError: non-string content;
            # ValueError: text that cannot be encoded.
            print(f"保存文章時發生錯誤: {str(e)}")
            return False
        print(f"文章已成功保存為 {filename}")
        return True

    def _quit_driver(self):
        """Close the current browser, if any, and reset ``self.driver``."""
        driver, self.driver = self.driver, None
        if driver is None:
            return
        try:
            driver.quit()
        except Exception as e:
            print(f'Error: {e}')

    def _wait_present(self, driver, by, value):
        """Return the element once it is in the DOM (bounded wait)."""
        return WebDriverWait(driver, self.ELEMENT_TIMEOUT).until(
            EC.presence_of_element_located((by, value))
        )

    def _wait_interactable(self, driver, locate):
        """Poll ``locate()`` until its element is displayed and enabled.

        The wait is bounded by ``ELEMENT_TIMEOUT`` and raises on timeout
        instead of polling forever.
        """
        def ready(_driver):
            element = locate()
            if element.is_displayed() and element.is_enabled():
                return element
            return False

        return WebDriverWait(driver, self.ELEMENT_TIMEOUT).until(ready)

    def _type_and_submit(self, driver, field, text):
        """Type ``text`` into ``field`` and press Enter, with a JS fallback."""
        try:
            field.send_keys(text)
            field.send_keys(Keys.RETURN)
        except Exception as e:
            print(f'Error: {e}')
            # NOTE(review): this fallback only fills the value and fires an
            # 'input' event; it does not press Enter - confirm that is enough
            # for the page to submit.
            driver.execute_script("arguments[0].value = arguments[1];", field, text)
            driver.execute_script("arguments[0].dispatchEvent(new Event('input'));", field)

    def _open_new_notebook(self, driver):
        """Open NotebookLM and click the button that creates a notebook."""
        driver.get(self.NOTEBOOKLM_URL)
        time.sleep(3)
        create_button = self._wait_present(
            driver,
            By.XPATH,
            '/html/body/labs-tailwind-root/div/welcome-page/div/div[2]/div[1]/div/button/span[2]',
        )
        create_button.click()
        time.sleep(3)

    def _add_first_source(self, driver, source_url):
        """Add the first URL through the dialog shown for a new notebook."""
        upload_icon = self._wait_present(
            driver,
            By.CSS_SELECTOR,
            '.mat-icon.notranslate.upload-icon.google-symbols.mat-icon-no-color',
        )
        driver.execute_script("arguments[0].scrollIntoView(true);", upload_icon)
        upload_icon.click()
        time.sleep(3)

        # The "website" chip of the source picker.
        website_chip = self._wait_present(
            driver, By.XPATH, '//*[@id="mat-mdc-chip-2"]/span[2]/span/span[2]'
        )
        driver.execute_script("arguments[0].scrollIntoView(true);", website_chip)
        website_chip.click()
        time.sleep(1)

        website_input = self._wait_present(driver, By.ID, 'mat-input-0')
        website_input.send_keys(source_url)
        website_input.send_keys(Keys.RETURN)
        print(f'成功輸入網址: {source_url}')
        time.sleep(7)

    def _find_website_chip(self, driver):
        """Locate the <mat-chip> inside the second chip group."""
        groups = driver.find_element(By.CLASS_NAME, 'chip-groups').find_elements(
            By.CSS_SELECTOR, '.chip-group.ng-star-inserted'
        )
        return groups[1].find_element(By.TAG_NAME, 'mat-chip')

    def _add_extra_source(self, driver, source_url):
        """Add a second or later URL through the "add source" button."""
        # Best effort, as in the original flow: a failed click is reported
        # and the remaining steps are still attempted.
        try:
            self._wait_present(driver, By.CLASS_NAME, "mdc-button__label")
            add_source_label = driver.find_elements(By.CLASS_NAME, "mdc-button__label")[2]
            add_source_label.click()
        except Exception as e:
            print(f'Error: {e}')
        time.sleep(3)

        try:
            self._find_website_chip(driver).click()
        except Exception as e:
            print(f'Error: {e}')
            # Fall back to a JavaScript click; the chip is looked up again so
            # the fallback never depends on a half-finished first lookup.
            try:
                chip = self._find_website_chip(driver)
                driver.execute_script("arguments[0].click();", chip)
                print("使用 JavaScript 成功點擊元素")
            except Exception as js_error:
                print(f'JavaScript 點擊錯誤: {js_error}')
        time.sleep(2)

        def locate_input():
            upload = driver.find_element(By.TAG_NAME, 'website-upload')
            form_field = upload.find_element(By.TAG_NAME, 'mat-form-field')
            return form_field.find_element(By.TAG_NAME, 'input')

        website_input = self._wait_interactable(driver, locate_input)
        self._type_and_submit(driver, website_input, source_url)
        print(f'成功輸入網址: {source_url}')
        time.sleep(7)

    def _collect_answer(self, driver, urls):
        """Create a notebook, add every URL, ask for the article, return it.

        Raises:
            RuntimeError: when the answer element contains no text.
        """
        self._open_new_notebook(driver)

        for index, source_url in enumerate(urls):
            time.sleep(5)
            if index == 0:
                self._add_first_source(driver, source_url)
            else:
                self._add_extra_source(driver, source_url)
        time.sleep(5)

        def locate_prompt_box():
            chat_panel = driver.find_element(By.TAG_NAME, 'chat-panel')
            query_box = chat_panel.find_element(By.TAG_NAME, 'omnibar').find_element(
                By.TAG_NAME, 'query-box'
            )
            return query_box.find_element(By.TAG_NAME, 'textarea')

        prompt_input = self._wait_interactable(driver, locate_prompt_box)
        complete_prompt = "彙整並生成一篇以常見問題為主的 MarkDown 格式的 專業文章。在各個分類或重要段落中加入一些具體的案例或數據。5. 若內容不是中文,要精準翻譯成中文。"
        self._type_and_submit(driver, prompt_input, complete_prompt)
        print('等候答案')
        time.sleep(self.ANSWER_DELAY)

        def answer_ready(_driver):
            messages = _driver.find_element(By.TAG_NAME, 'chat-panel').find_elements(
                By.TAG_NAME, 'chat-message'
            )
            # The answer element may not exist yet; keep polling rather than
            # failing with an IndexError.
            if len(messages) <= self.ANSWER_INDEX:
                return False
            message = messages[self.ANSWER_INDEX]
            if message.is_displayed() and message.is_enabled():
                return message
            return False

        answer = WebDriverWait(driver, self.ELEMENT_TIMEOUT).until(answer_ready)
        answer_text = answer.text
        if not answer_text.strip():
            # Sending an empty answer to the model would only produce noise.
            raise RuntimeError('NotebookLM 回傳的內容為空')
        return answer_text
# Manual test area: runs only when this module is executed directly.
if __name__ == "__main__":
    # Source pages handed to generate_article.
    urls = [
        "https://zh.wikipedia.org/zh-tw/%E8%99%9B%E6%93%AC%E4%BA%BA",
        "https://www.naipo.com/Portals/1/web_tw/Knowledge_Center/Industry_Insight/IPNC_240515_1501.htm",
        "https://money.udn.com/money/story/11162/8333646",
        "https://gitmind.com/tw/digital-human-creator.html",
        "https://support.microsoft.com/zh-tw/office/%E5%9C%A8-microsoft-teams-%E4%B8%AD%E4%BB%A5%E8%99%9B%E6%93%AC%E4%BA%BA%E5%81%B6%E7%9A%84%E5%BD%A2%E5%BC%8F%E5%8A%A0%E5%85%A5%E6%9C%83%E8%AD%B0-5384e7b7-30c7-4bcb-8065-0c9e830cc8ad",
        "https://digitaldomain.com/%E8%99%9B%E6%93%AC%E4%BA%BA%E5%AF%A6%E9%A9%97%E5%AE%A4/?lang=zh-hant",
        "https://www.cdri.org.tw/xcdoc/cont?xsmsid=0H270572678476094046&sid=0N149542836021459905",
        "https://learn.microsoft.com/zh-tw/azure/ai-services/speech-service/text-to-speech-avatar/what-is-text-to-speech-avatar",
        "https://www.xfyun.cn/services/VirtualHumans",
        "https://www.bnext.com.tw/article/65449/virtual-human-subculture",
    ]

    # Chrome "User Data" directory and the profile folder to reuse.
    article_generator = ArticleGenerator(
        'C:/Users/wangy/AppData/Local/Google/Chrome/User Data',
        'Profile 20',
    )

    # generate_article returns a status string, or False on failure;
    # only a truthy result is echoed.
    outcome = article_generator.generate_article(urls)
    if outcome:
        print(outcome)
|