# urlToarticle.py
# -*- coding: utf-8 -*-
import logging
import os
import subprocess
import time

import undetected_chromedriver as uc
from dotenv import load_dotenv
from langchain_community.callbacks import get_openai_callback
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from pyvirtualdisplay import Display
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
  16. # 加载环境变量
  17. load_dotenv()
  18. # 紀錄程式執行狀況
  19. logging.basicConfig(level=logging.INFO)
  20. class ArticleGenerator:
  21. def __init__(self, user_data_dir, profile_directory, keyword):
  22. self.user_data_dir = user_data_dir
  23. self.profile_directory = profile_directory
  24. self.driver = None
  25. self.keyword = keyword
  26. self.OPEN_API_KEY: str = os.getenv('OPEN_API_KEY')
  27. def get_webdriver(self):
  28. os.system("pkill -f chrome")
  29. for attempt in range(3): # 嘗試最多 3 次
  30. try:
  31. display = Display(visible=0, size=(1920, 1080))
  32. display.start()
  33. options = uc.ChromeOptions()
  34. options.add_argument('--disable-setuid-sandbox')
  35. options.add_argument("--no-sandbox")
  36. options.add_argument("--disable-dev-shm-usage")
  37. options.add_argument("--disable-blink-features=AutomationControlled")
  38. #options.add_argument(f"--user-data-dir={self.user_data_dir}") # Linux Profile
  39. #options.add_argument(f'--profile-directory=Profile1')
  40. options.add_argument(f"--user-data-dir=/home/ling/.config/google-chrome/Profile2")
  41. self.driver=uc.Chrome(options=options,version_main=132)
  42. self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
  43. return self.driver
  44. except Exception as e:
  45. print(f'Error: {e}')
  46. print(f"WebDriver 啟動失敗,第 {attempt + 1} 次嘗試...")
  47. if attempt == 2:
  48. raise e
  49. time.sleep(2) # 等待 2 秒後重試
  50. # AI問答 - 取得 openai 回應
  51. def ttm(self, article, language='繁體中文'):
  52. with get_openai_callback() as cb:
  53. model_name = "gpt-4o"
  54. llm = ChatOpenAI(model_name=model_name, temperature=0.7, api_key=self.OPEN_API_KEY, max_tokens=4096)
  55. qa_system_prompt = f"""你是一個專門做 md 格式文章的AI助理.
  56. 可以從一大串文字整理出高質量的 md 格式文章. 多餘的內容拿掉,只需要回傳文章內容即可.
  57. 輸出的語言為{language}."""
  58. qa_prompt = ChatPromptTemplate.from_messages(
  59. [
  60. ("system", qa_system_prompt),
  61. ("human", "{question}"),
  62. ]
  63. )
  64. rag_chain = (
  65. qa_prompt
  66. | llm
  67. | StrOutputParser()
  68. )
  69. text = rag_chain.invoke(
  70. {"question": article+"請轉成 md 格式"}
  71. )
  72. print(f"Total Tokens: {cb.total_tokens}")
  73. print(f"Prompt Tokens: {cb.prompt_tokens}")
  74. print(f"Completion Tokens: {cb.completion_tokens}")
  75. print(f"Total Cost (USD): ${cb.total_cost}")
  76. return text
  77. def generate_article(self, urls):
  78. for attempt in range(3):
  79. try:
  80. driver = self.get_webdriver()
  81. url = "https://notebooklm.google.com/"
  82. driver.get(url)
  83. time.sleep(3)
  84. print('notebook這裡',driver.current_url)
  85. # 新建專案
  86. new_created = driver.find_element(By.XPATH, '/html/body/labs-tailwind-root/div/welcome-page/div/div[2]/div[1]/div/button/span[2]')
  87. while not new_created:
  88. time.sleep(0.1)
  89. new_created = driver.find_element(By.XPATH, '/html/body/labs-tailwind-root/div/welcome-page/div/div[2]/div[1]/div/button/span[2]')
  90. new_created.click()
  91. time.sleep(2)
  92. # # 點擊 專案
  93. # notebooks = driver.find_element(By.CLASS_NAME, 'project-buttons-flow ng-star-inserted')
  94. # # 選取第一個 project
  95. # new_notebook = notebooks.find_element(By.TAG_NAME, 'project-button')
  96. # while not new_notebook:
  97. # time.sleep(0.1)
  98. # new_notebook = notebooks.find_element(By.TAG_NAME, 'project-button')
  99. # new_notebook.click()
  100. time.sleep(1)
  101. # 將網址都輸入
  102. for i, url in enumerate(urls):
  103. time.sleep(5)
  104. if i == 0:
  105. # 找網站按鈕
  106. text_click = driver.find_element(By.XPATH, '//*[@id="mat-mdc-chip-2"]/span[2]/span/span[2]')
  107. print('這裡',text_click)
  108. time.sleep(1)
  109. driver.execute_script("arguments[0].scrollIntoView(true);", text_click)
  110. while not text_click:
  111. time.sleep(1)
  112. text_click = driver.find_element(By.XPATH, '//*[@id="mat-mdc-chip-2"]/span[2]/span/span[2]')
  113. driver.execute_script("arguments[0].scrollIntoView(true);", text_click)
  114. driver.execute_script("arguments[0].click();", text_click)
  115. # text_click.click()
  116. time.sleep(1)
  117. # 找到 textarea 並輸入網址
  118. website_input = driver.find_element(By.ID, 'mat-input-0')
  119. while not text_click:
  120. time.sleep(0.1)
  121. website_input = driver.find_element(By.ID, 'mat-input-0')
  122. website_input.send_keys(url)
  123. website_input.send_keys(Keys.RETURN)
  124. print(f'成功輸入網址: {url}')
  125. time.sleep(7)
  126. else:
  127. try:
  128. WebDriverWait(driver, 20).until(
  129. EC.presence_of_element_located((By.CLASS_NAME, "mdc-button__label"))
  130. )
  131. 新增來源 = driver.find_elements(By.CLASS_NAME, "mdc-button__label")[2]
  132. print(新增來源.is_displayed())
  133. while not 新增來源:
  134. time.sleep(0.1)
  135. 新增來源 = driver.find_element(By.CLASS_NAME, "mdc-button__label")[2]
  136. 新增來源.click()
  137. except Exception as e:
  138. print(f'Error: {e}')
  139. # 嘗試使用 JavaScript 來檢查元素是否可用
  140. driver.execute_script("return document.querySelector('button #mat-tab-group-0-content-0 > div > div > div > source-picker > div > div.ng-tns-c2551705568-5.ng-star-inserted > button > span.mat-mdc-button-persistent-ripple.mdc-button__ripple') != null")
  141. time.sleep(3)
  142. # 找網站按鈕
  143. try:
  144. # 取得整個按鈕列表
  145. clicks = driver.find_element(By.CLASS_NAME, 'chip-groups').find_elements(By.CSS_SELECTOR, '.chip-group.ng-star-inserted')
  146. print(clicks)
  147. text_click = clicks[1].find_element(By.TAG_NAME, 'mat-chip')
  148. while not text_click:
  149. time.sleep(0.1)
  150. text_click = clicks[1].find_element(By.TAG_NAME, 'mat-chip')
  151. text_click.click()
  152. except Exception as e:
  153. print(f'Error: {e}')
  154. # 如果捕獲到錯誤,使用 JavaScript 強制點擊
  155. try:
  156. text_click = clicks[1].find_element(By.TAG_NAME, 'mat-chip')
  157. driver.execute_script("arguments[0].click();", text_click) # 使用 JS 點擊元素
  158. print("使用 JavaScript 成功點擊元素")
  159. except Exception as js_error:
  160. print(f'JavaScript 點擊錯誤: {js_error}')
  161. time.sleep(2)
  162. # 找到 textarea 並輸入網址
  163. # 取得上面幾層
  164. div = driver.find_element(By.TAG_NAME, 'website-upload')
  165. # print(div.text)
  166. website_input = div.find_element(By.TAG_NAME, 'mat-form-field').find_element(By.TAG_NAME, 'input')
  167. # 確保元素可以操作
  168. while not website_input.is_displayed() or not website_input.is_enabled():
  169. time.sleep(0.1)
  170. website_input = div.find_element(By.TAG_NAME, 'mat-form-field').find_element(By.TAG_NAME, 'input')
  171. try:
  172. website_input.send_keys(url)
  173. website_input.send_keys(Keys.RETURN)
  174. except Exception as e:
  175. print(f'Error: {e}')
  176. # 使用 JavaScript 強制發送鍵盤事件
  177. driver.execute_script("arguments[0].value = arguments[1];", website_input, url) # 將文字輸入到 input
  178. driver.execute_script("arguments[0].dispatchEvent(new Event('input'));", website_input) # 觸發 input 事件
  179. print(f'成功輸入網址: {url}')
  180. time.sleep(7)
  181. time.sleep(5)
  182. # 上面幾層
  183. omnibar = driver.find_element(By.TAG_NAME, 'chat-panel').find_element(By.TAG_NAME, 'omnibar')
  184. box = omnibar.find_element(By.TAG_NAME, 'query-box')
  185. prompt_input = box.find_element(By.TAG_NAME, 'textarea')
  186. while not prompt_input.is_displayed() or not prompt_input.is_enabled():
  187. time.sleep(0.1)
  188. prompt_input = box.find_element(By.TAG_NAME, 'textarea')
  189. j = 1
  190. fore_content = ''
  191. # while '##' not in fore_content:
  192. # if j > 5:
  193. # break
  194. # 輸入生成文章的 prompt
  195. # complete_prompt = "1. 彙整並生成一篇以常見問題為主的 MarkDown 格式的 專業文章(給客戶看的,最後一句「希望這篇文章能解答...」整句直接刪掉)。2. 文章必須為 MarkDown 格式,也就是必須包含#、##等等。3. 在各個分類或重要段落中加入一些具體的案例或數據。4. 必須使用段落格式,不要條列式。5. 若內容不是中文,要精準翻譯成中文。6. 生成 MarkDown 的 md 檔(須包含大標題及各個段落的標題)。"
  196. #complete_prompt = "彙整並生成一篇以常見問題為主的 MarkDown 格式的 專業文章。在各個分類或重要段落中加入一些具體的案例或數據。5. 若內容不是中文,要精準翻譯成中文。"
  197. complete_prompt = f"1. 彙整並生成一篇以常見問題為主的 MarkDown 格式的 專業文章(給客戶看的,最後一句「希望這篇文章能解答...」整句直接刪掉)。2. 文章必須出現多次這組字:{self.keyword}。3. 在各個分類或重要段落中加入一些具體的案例或數據。4. 必須使用段落格式,不要條列式。5. 若內容不是中文,要精準翻譯成中文。6. 生成 MarkDown 的 md 檔(須包含大標題及各個段落的標題)。"
  198. try:
  199. prompt_input.send_keys(complete_prompt)
  200. prompt_input.send_keys(Keys.RETURN)
  201. except Exception as e:
  202. print(f'Error: {e}')
  203. # 使用 JavaScript 強制發送鍵盤事件
  204. driver.execute_script("arguments[0].value = arguments[1];", prompt_input, complete_prompt) # 將文字輸入到 input
  205. driver.execute_script("arguments[0].dispatchEvent(new Event('input'));", prompt_input) # 觸發 input 事件
  206. print('等候答案')
  207. time.sleep(40) # 等待答案生成
  208. message = driver.find_element(By.TAG_NAME, 'chat-panel').find_elements(By.TAG_NAME, 'chat-message')[j]
  209. while not message.is_displayed() or not message.is_enabled():
  210. time.sleep(0.1)
  211. message = driver.find_element(By.TAG_NAME, 'chat-panel').find_elements(By.TAG_NAME, 'chat-message')[j]
  212. # answers = message.find_elements(By.TAG_NAME, 'labs-tailwind-structural-element-view-v2')
  213. # while not answers[1].is_displayed() or not answers[1].is_enabled():
  214. # time.sleep(0.1)
  215. # answers = message.find_elements(By.TAG_NAME, 'labs-tailwind-structural-element-view-v2')
  216. # fore_content = message.text
  217. # j+=2
  218. # 直接生成 md 檔 或是 生成文字?
  219. # texts = []
  220. # if '#' not in answers[0].text:
  221. # for i, answer in enumerate(answers[1:]):
  222. # if '##' not in fore_content:
  223. # if i == 0:
  224. # texts.append('# ' + answer.text.strip())
  225. # elif i % 2 == 1 and i != len(answers[1:])-1:
  226. # texts.append('## ' + answer.text.strip())
  227. # elif i == len(answers[1:])-1:
  228. # continue
  229. # else:
  230. # texts.append(answer.text.strip())
  231. # else:
  232. # if '# # ' in answer.text:
  233. # t = answer.text.replace('# # ', '# ').strip()
  234. # texts.append(t)
  235. # else:
  236. # texts.append(answer.text.strip())
  237. # else:
  238. # for i, answer in enumerate(answers):
  239. # if '##' not in fore_content:
  240. # if i == 0:
  241. # texts.append('# ' + answer.text.strip())
  242. # elif i % 2 == 1 and i != len(answers[1:])-1:
  243. # texts.append('## ' + answer.text.strip())
  244. # elif i == len(answers[1:])-1:
  245. # continue
  246. # else:
  247. # texts.append(answer.text.strip())
  248. # else:
  249. # if '# # ' in answer.text:
  250. # t = answer.text.replace('# # ', '# ').strip()
  251. # texts.append(t)
  252. # else:
  253. # texts.append(answer.text.strip())
  254. # print(texts)
  255. # if len(texts) > 1:
  256. # content = '\n'.join(texts)
  257. # else:
  258. # content = texts[0].replace('# # ', '# ')
  259. # print(content)
  260. text = self.ttm(message.text)
  261. is_succesed = self.save_article_as_md(text)
  262. if is_succesed:
  263. return 'article.md 生成成功'
  264. else:
  265. return '存檔失敗'
  266. except Exception as e:
  267. print("出現錯誤: ", str(e))
  268. self.driver.quit()
  269. self.driver = None
  270. if self.driver:
  271. time.sleep(5)
  272. self.driver.quit()
  273. return False
  274. def save_article_as_md(self, content, filename="./article.md"):
  275. # 打開或創建一個 .md 文件
  276. try:
  277. with open(filename, 'w+', encoding='utf-8-sig') as file:
  278. # 將文章內容寫入文件
  279. file.write(content)
  280. print(f"文章已成功保存為 {filename}")
  281. return True
  282. except Exception as e:
  283. print(f"保存文章時發生錯誤: {str(e)}")
  284. return False
  285. # 假設生成的文章內容如下(這會是你的生成內容)
  286. # 測試區域(僅在此模組直接執行時執行)
  287. if __name__ == "__main__":
  288. # driver_path = 'chromedriver-win32/chromedriver.exe'
  289. user_data_dir = 'C:/Users/wangy/AppData/Local/Google/Chrome/User Data'
  290. profile_directory = 'Profile 20'
  291. # 範例用法
  292. article_generator = ArticleGenerator(user_data_dir, profile_directory)
  293. urls = [
  294. "https://zh.wikipedia.org/zh-tw/%E8%99%9B%E6%93%AC%E4%BA%BA",
  295. "https://www.naipo.com/Portals/1/web_tw/Knowledge_Center/Industry_Insight/IPNC_240515_1501.htm",
  296. "https://money.udn.com/money/story/11162/8333646",
  297. "https://gitmind.com/tw/digital-human-creator.html",
  298. "https://support.microsoft.com/zh-tw/office/%E5%9C%A8-microsoft-teams-%E4%B8%AD%E4%BB%A5%E8%99%9B%E6%93%AC%E4%BA%BA%E5%81%B6%E7%9A%84%E5%BD%A2%E5%BC%8F%E5%8A%A0%E5%85%A5%E6%9C%83%E8%AD%B0-5384e7b7-30c7-4bcb-8065-0c9e830cc8ad",
  299. "https://digitaldomain.com/%E8%99%9B%E6%93%AC%E4%BA%BA%E5%AF%A6%E9%A9%97%E5%AE%A4/?lang=zh-hant",
  300. "https://www.cdri.org.tw/xcdoc/cont?xsmsid=0H270572678476094046&sid=0N149542836021459905",
  301. "https://learn.microsoft.com/zh-tw/azure/ai-services/speech-service/text-to-speech-avatar/what-is-text-to-speech-avatar",
  302. "https://www.xfyun.cn/services/VirtualHumans",
  303. "https://www.bnext.com.tw/article/65449/virtual-human-subculture"
  304. ]
  305. is_succesed = article_generator.generate_article(urls)
  306. if is_succesed:
  307. print(is_succesed)
  308. # import notebookllm
  309. # # 設置包含網址的 prompt
  310. # prompt = "請分析這個網址內容並轉成 md 格式文章: https://zh.wikipedia.org/zh-tw/%E8%99%9B%E6%93%AC%E4%BA%BA"
  311. # # 發送 prompt 給模型
  312. # response = notebookllm.Notebook()
  313. # # 顯示模型的回應
  314. # print(response)