# urlToarticle.py (18 KB)
  1. # -*- coding: utf-8 -*-
  2. from selenium.webdriver.common.by import By
  3. from selenium.webdriver.common.keys import Keys
  4. from selenium.webdriver.support.ui import WebDriverWait
  5. from selenium.webdriver.support import expected_conditions as EC
  6. from langchain_openai import ChatOpenAI
  7. from langchain_core.output_parsers import StrOutputParser
  8. from langchain_community.callbacks import get_openai_callback
  9. from langchain_core.prompts import ChatPromptTemplate
  10. import time
  11. import undetected_chromedriver as uc
  12. from dotenv import load_dotenv
  13. import os
  14. import logging
  15. # 加载环境变量
  16. load_dotenv()
  17. # 紀錄程式執行狀況
  18. logging.basicConfig(level=logging.INFO)
  19. class ArticleGenerator:
  20. def __init__(self, user_data_dir, profile_directory):
  21. self.user_data_dir = user_data_dir
  22. self.profile_directory = profile_directory
  23. self.driver = None
  24. self.OPEN_API_KEY: str = os.getenv('OPEN_API_KEY')
  25. def get_webdriver(self):
  26. for attempt in range(3): # 嘗試最多 3 次
  27. try:
  28. options = uc.ChromeOptions()
  29. # options.add_argument("--disable-blink-features=AutomationControlled")
  30. # options.add_experimental_option("excludeSwitches", ["enable-automation"])
  31. # options.add_experimental_option("useAutomationExtension", False)
  32. options.add_argument('--ignore-certificate-errors')
  33. options.add_argument("--disable-gpu")
  34. options.add_argument("--disable-dev-shm-usage")
  35. # options.add_argument("headless")
  36. options.add_argument(f"user-data-dir={self.user_data_dir}")
  37. options.add_argument(f'--profile-directory={self.profile_directory}')
  38. # s = Service(self.driver_path)
  39. self.driver=uc.Chrome(options=options, version_main=132, use_subprocess=True)
  40. self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
  41. return self.driver
  42. except Exception as e:
  43. print(f'Error: {e}')
  44. print(f"WebDriver 啟動失敗,第 {attempt + 1} 次嘗試...")
  45. if attempt == 2:
  46. raise e
  47. time.sleep(2) # 等待 2 秒後重試
  48. # AI問答 - 取得 openai 回應
  49. def ttm(self, article, language='繁體中文'):
  50. with get_openai_callback() as cb:
  51. model_name = "gpt-4o"
  52. llm = ChatOpenAI(model_name=model_name, temperature=0.7, api_key=self.OPEN_API_KEY, max_tokens=4096)
  53. qa_system_prompt = f"""你是一個專門做 md 格式文章的AI助理.
  54. 可以從一大串文字整理出高質量的 md 格式文章. 多餘的內容拿掉,只需要回傳文章內容即可.
  55. 輸出的語言為{language}."""
  56. qa_prompt = ChatPromptTemplate.from_messages(
  57. [
  58. ("system", qa_system_prompt),
  59. ("human", "{question}"),
  60. ]
  61. )
  62. rag_chain = (
  63. qa_prompt
  64. | llm
  65. | StrOutputParser()
  66. )
  67. text = rag_chain.invoke(
  68. {"question": article+"請轉成 md 格式"}
  69. )
  70. print(f"Total Tokens: {cb.total_tokens}")
  71. print(f"Prompt Tokens: {cb.prompt_tokens}")
  72. print(f"Completion Tokens: {cb.completion_tokens}")
  73. print(f"Total Cost (USD): ${cb.total_cost}")
  74. return text
  75. def generate_article(self, urls):
  76. for attempt in range(3):
  77. try:
  78. driver = self.get_webdriver()
  79. url = "https://notebooklm.google.com/"
  80. driver.get(url)
  81. time.sleep(3)
  82. # 新建專案
  83. new_created = driver.find_element(By.XPATH, '/html/body/labs-tailwind-root/div/welcome-page/div/div[2]/div[1]/div/button/span[2]')
  84. while not new_created:
  85. time.sleep(0.1)
  86. new_created = driver.find_element(By.XPATH, '/html/body/labs-tailwind-root/div/welcome-page/div/div[2]/div[1]/div/button/span[2]')
  87. new_created.click()
  88. time.sleep(2)
  89. # # 點擊 專案
  90. # notebooks = driver.find_element(By.CLASS_NAME, 'project-buttons-flow ng-star-inserted')
  91. # # 選取第一個 project
  92. # new_notebook = notebooks.find_element(By.TAG_NAME, 'project-button')
  93. # while not new_notebook:
  94. # time.sleep(0.1)
  95. # new_notebook = notebooks.find_element(By.TAG_NAME, 'project-button')
  96. # new_notebook.click()
  97. time.sleep(1)
  98. # 將網址都輸入
  99. for i, url in enumerate(urls):
  100. time.sleep(5)
  101. if i == 0:
  102. upload_urls = driver.find_element(By.CSS_SELECTOR, '.mat-icon.notranslate.upload-icon.google-symbols.mat-icon-no-color')
  103. driver.execute_script("arguments[0].scrollIntoView(true);", upload_urls)
  104. while not upload_urls:
  105. time.sleep(0.1)
  106. upload_urls = driver.find_element(By.CSS_SELECTOR, '.mat-icon.notranslate.upload-icon.google-symbols.mat-icon-no-color')
  107. driver.execute_script("arguments[0].scrollIntoView(true);", upload_urls)
  108. upload_urls.click()
  109. time.sleep(3)
  110. # 找網站按鈕
  111. text_click = driver.find_element(By.XPATH, '//*[@id="mat-mdc-chip-2"]/span[2]/span/span[2]')
  112. driver.execute_script("arguments[0].scrollIntoView(true);", text_click)
  113. while not text_click:
  114. time.sleep(0.1)
  115. text_click = driver.find_element(By.XPATH, '//*[@id="mat-mdc-chip-2"]/span[2]/span/span[2]')
  116. driver.execute_script("arguments[0].scrollIntoView(true);", text_click)
  117. text_click.click()
  118. time.sleep(1)
  119. # 找到 textarea 並輸入網址
  120. website_input = driver.find_element(By.ID, 'mat-input-0')
  121. while not text_click:
  122. time.sleep(0.1)
  123. website_input = driver.find_element(By.ID, 'mat-input-0')
  124. website_input.send_keys(url)
  125. website_input.send_keys(Keys.RETURN)
  126. print(f'成功輸入網址: {url}')
  127. time.sleep(7)
  128. else:
  129. try:
  130. WebDriverWait(driver, 20).until(
  131. EC.presence_of_element_located((By.CLASS_NAME, "mdc-button__label"))
  132. )
  133. 新增來源 = driver.find_elements(By.CLASS_NAME, "mdc-button__label")[2]
  134. print(新增來源.is_displayed())
  135. while not 新增來源:
  136. time.sleep(0.1)
  137. 新增來源 = driver.find_element(By.CLASS_NAME, "mdc-button__label")[2]
  138. 新增來源.click()
  139. except Exception as e:
  140. print(f'Error: {e}')
  141. # 嘗試使用 JavaScript 來檢查元素是否可用
  142. driver.execute_script("return document.querySelector('button #mat-tab-group-0-content-0 > div > div > div > source-picker > div > div.ng-tns-c2551705568-5.ng-star-inserted > button > span.mat-mdc-button-persistent-ripple.mdc-button__ripple') != null")
  143. time.sleep(3)
  144. # 找網站按鈕
  145. try:
  146. # 取得整個按鈕列表
  147. clicks = driver.find_element(By.CLASS_NAME, 'chip-groups').find_elements(By.CSS_SELECTOR, '.chip-group.ng-star-inserted')
  148. print(clicks)
  149. text_click = clicks[1].find_element(By.TAG_NAME, 'mat-chip')
  150. while not text_click:
  151. time.sleep(0.1)
  152. text_click = clicks[1].find_element(By.TAG_NAME, 'mat-chip')
  153. text_click.click()
  154. except Exception as e:
  155. print(f'Error: {e}')
  156. # 如果捕獲到錯誤,使用 JavaScript 強制點擊
  157. try:
  158. text_click = clicks[1].find_element(By.TAG_NAME, 'mat-chip')
  159. driver.execute_script("arguments[0].click();", text_click) # 使用 JS 點擊元素
  160. print("使用 JavaScript 成功點擊元素")
  161. except Exception as js_error:
  162. print(f'JavaScript 點擊錯誤: {js_error}')
  163. time.sleep(2)
  164. # 找到 textarea 並輸入網址
  165. # 取得上面幾層
  166. div = driver.find_element(By.TAG_NAME, 'website-upload')
  167. # print(div.text)
  168. website_input = div.find_element(By.TAG_NAME, 'mat-form-field').find_element(By.TAG_NAME, 'input')
  169. # 確保元素可以操作
  170. while not website_input.is_displayed() or not website_input.is_enabled():
  171. time.sleep(0.1)
  172. website_input = div.find_element(By.TAG_NAME, 'mat-form-field').find_element(By.TAG_NAME, 'input')
  173. try:
  174. website_input.send_keys(url)
  175. website_input.send_keys(Keys.RETURN)
  176. except Exception as e:
  177. print(f'Error: {e}')
  178. # 使用 JavaScript 強制發送鍵盤事件
  179. driver.execute_script("arguments[0].value = arguments[1];", website_input, url) # 將文字輸入到 input
  180. driver.execute_script("arguments[0].dispatchEvent(new Event('input'));", website_input) # 觸發 input 事件
  181. print(f'成功輸入網址: {url}')
  182. time.sleep(7)
  183. time.sleep(5)
  184. # 上面幾層
  185. omnibar = driver.find_element(By.TAG_NAME, 'chat-panel').find_element(By.TAG_NAME, 'omnibar')
  186. box = omnibar.find_element(By.TAG_NAME, 'query-box')
  187. prompt_input = box.find_element(By.TAG_NAME, 'textarea')
  188. while not prompt_input.is_displayed() or not prompt_input.is_enabled():
  189. time.sleep(0.1)
  190. prompt_input = box.find_element(By.TAG_NAME, 'textarea')
  191. j = 1
  192. fore_content = ''
  193. # while '##' not in fore_content:
  194. # if j > 5:
  195. # break
  196. # 輸入生成文章的 prompt
  197. # complete_prompt = "1. 彙整並生成一篇以常見問題為主的 MarkDown 格式的 專業文章(給客戶看的,最後一句「希望這篇文章能解答...」整句直接刪掉)。2. 文章必須為 MarkDown 格式,也就是必須包含#、##等等。3. 在各個分類或重要段落中加入一些具體的案例或數據。4. 必須使用段落格式,不要條列式。5. 若內容不是中文,要精準翻譯成中文。6. 生成 MarkDown 的 md 檔(須包含大標題及各個段落的標題)。"
  198. complete_prompt = "彙整並生成一篇以常見問題為主的 MarkDown 格式的 專業文章。在各個分類或重要段落中加入一些具體的案例或數據。5. 若內容不是中文,要精準翻譯成中文。"
  199. try:
  200. prompt_input.send_keys(complete_prompt)
  201. prompt_input.send_keys(Keys.RETURN)
  202. except Exception as e:
  203. print(f'Error: {e}')
  204. # 使用 JavaScript 強制發送鍵盤事件
  205. driver.execute_script("arguments[0].value = arguments[1];", prompt_input, complete_prompt) # 將文字輸入到 input
  206. driver.execute_script("arguments[0].dispatchEvent(new Event('input'));", prompt_input) # 觸發 input 事件
  207. print('等候答案')
  208. time.sleep(40) # 等待答案生成
  209. message = driver.find_element(By.TAG_NAME, 'chat-panel').find_elements(By.TAG_NAME, 'chat-message')[j]
  210. while not message.is_displayed() or not message.is_enabled():
  211. time.sleep(0.1)
  212. message = driver.find_element(By.TAG_NAME, 'chat-panel').find_elements(By.TAG_NAME, 'chat-message')[j]
  213. # answers = message.find_elements(By.TAG_NAME, 'labs-tailwind-structural-element-view-v2')
  214. # while not answers[1].is_displayed() or not answers[1].is_enabled():
  215. # time.sleep(0.1)
  216. # answers = message.find_elements(By.TAG_NAME, 'labs-tailwind-structural-element-view-v2')
  217. # fore_content = message.text
  218. # j+=2
  219. # 直接生成 md 檔 或是 生成文字?
  220. # texts = []
  221. # if '#' not in answers[0].text:
  222. # for i, answer in enumerate(answers[1:]):
  223. # if '##' not in fore_content:
  224. # if i == 0:
  225. # texts.append('# ' + answer.text.strip())
  226. # elif i % 2 == 1 and i != len(answers[1:])-1:
  227. # texts.append('## ' + answer.text.strip())
  228. # elif i == len(answers[1:])-1:
  229. # continue
  230. # else:
  231. # texts.append(answer.text.strip())
  232. # else:
  233. # if '# # ' in answer.text:
  234. # t = answer.text.replace('# # ', '# ').strip()
  235. # texts.append(t)
  236. # else:
  237. # texts.append(answer.text.strip())
  238. # else:
  239. # for i, answer in enumerate(answers):
  240. # if '##' not in fore_content:
  241. # if i == 0:
  242. # texts.append('# ' + answer.text.strip())
  243. # elif i % 2 == 1 and i != len(answers[1:])-1:
  244. # texts.append('## ' + answer.text.strip())
  245. # elif i == len(answers[1:])-1:
  246. # continue
  247. # else:
  248. # texts.append(answer.text.strip())
  249. # else:
  250. # if '# # ' in answer.text:
  251. # t = answer.text.replace('# # ', '# ').strip()
  252. # texts.append(t)
  253. # else:
  254. # texts.append(answer.text.strip())
  255. # print(texts)
  256. # if len(texts) > 1:
  257. # content = '\n'.join(texts)
  258. # else:
  259. # content = texts[0].replace('# # ', '# ')
  260. # print(content)
  261. text = self.ttm(message.text)
  262. is_succesed = self.save_article_as_md(text)
  263. if is_succesed:
  264. return 'article.md 生成成功'
  265. else:
  266. return '存檔失敗'
  267. except Exception as e:
  268. print("出現錯誤: ", str(e))
  269. self.driver.quit()
  270. self.driver = None
  271. if self.driver:
  272. time.sleep(5)
  273. self.driver.quit()
  274. return False
  275. def save_article_as_md(self, content, filename="./article.md"):
  276. # 打開或創建一個 .md 文件
  277. try:
  278. with open(filename, 'w+', encoding='utf-8-sig') as file:
  279. # 將文章內容寫入文件
  280. file.write(content)
  281. print(f"文章已成功保存為 {filename}")
  282. return True
  283. except Exception as e:
  284. print(f"保存文章時發生錯誤: {str(e)}")
  285. return False
  286. # 假設生成的文章內容如下(這會是你的生成內容)
  287. # 測試區域(僅在此模組直接執行時執行)
  288. if __name__ == "__main__":
  289. # driver_path = 'chromedriver-win32/chromedriver.exe'
  290. user_data_dir = 'C:/Users/wangy/AppData/Local/Google/Chrome/User Data'
  291. profile_directory = 'Profile 20'
  292. # 範例用法
  293. article_generator = ArticleGenerator(user_data_dir, profile_directory)
  294. urls = [
  295. "https://zh.wikipedia.org/zh-tw/%E8%99%9B%E6%93%AC%E4%BA%BA",
  296. "https://www.naipo.com/Portals/1/web_tw/Knowledge_Center/Industry_Insight/IPNC_240515_1501.htm",
  297. "https://money.udn.com/money/story/11162/8333646",
  298. "https://gitmind.com/tw/digital-human-creator.html",
  299. "https://support.microsoft.com/zh-tw/office/%E5%9C%A8-microsoft-teams-%E4%B8%AD%E4%BB%A5%E8%99%9B%E6%93%AC%E4%BA%BA%E5%81%B6%E7%9A%84%E5%BD%A2%E5%BC%8F%E5%8A%A0%E5%85%A5%E6%9C%83%E8%AD%B0-5384e7b7-30c7-4bcb-8065-0c9e830cc8ad",
  300. "https://digitaldomain.com/%E8%99%9B%E6%93%AC%E4%BA%BA%E5%AF%A6%E9%A9%97%E5%AE%A4/?lang=zh-hant",
  301. "https://www.cdri.org.tw/xcdoc/cont?xsmsid=0H270572678476094046&sid=0N149542836021459905",
  302. "https://learn.microsoft.com/zh-tw/azure/ai-services/speech-service/text-to-speech-avatar/what-is-text-to-speech-avatar",
  303. "https://www.xfyun.cn/services/VirtualHumans",
  304. "https://www.bnext.com.tw/article/65449/virtual-human-subculture"
  305. ]
  306. is_succesed = article_generator.generate_article(urls)
  307. if is_succesed:
  308. print(is_succesed)
  309. # import notebookllm
  310. # # 設置包含網址的 prompt
  311. # prompt = "請分析這個網址內容並轉成 md 格式文章: https://zh.wikipedia.org/zh-tw/%E8%99%9B%E6%93%AC%E4%BA%BA"
  312. # # 發送 prompt 給模型
  313. # response = notebookllm.Notebook()
  314. # # 顯示模型的回應
  315. # print(response)