# -*- coding: utf-8 -*-
# zooey / article_generate
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_community.callbacks import get_openai_callback
from langchain_core.prompts import ChatPromptTemplate
import time
import undetected_chromedriver as uc
from dotenv import load_dotenv
import os
import logging
from pyvirtualdisplay import Display

# Load environment variables (python-dotenv).
load_dotenv()
# Configure logging at INFO level to record how the program runs.
logging.basicConfig(level=logging.INFO)

class ArticleGenerator:
    def __init__(self, user_data_dir, profile_directory, keyword):
        self.user_data_dir = user_data_dir
        self.profile_directory = profile_directory
        self.driver = None
        self.keyword = keyword
        self.OPEN_API_KEY: str = os.getenv('OPEN_API_KEY')

    def get_webdriver(self):
        """Launch Chrome on a virtual display and return the driver.

        Runs ``pkill -f chrome`` first, then makes up to three attempts to
        start an ``undetected_chromedriver`` session inside a 1920x1080
        virtual display. The driver is stored on ``self.driver`` and the
        display that backs it on ``self._display``.

        Returns:
            The started ``uc.Chrome`` driver.

        Raises:
            Exception: Whatever the third failed attempt raised.
        """
        # A display kept from an earlier call would otherwise stay running
        # for the lifetime of the process once we start a new one below.
        stale_display = getattr(self, "_display", None)
        if stale_display is not None:
            try:
                stale_display.stop()
            except Exception as e:
                print(f'Error: {e}')
            self._display = None

        # NOTE: "pkill -f" matches every process whose command line contains
        # "chrome", not only browsers started by this class.
        os.system("pkill -f chrome")

        max_attempts = 3
        for attempt in range(max_attempts):  # try at most 3 times
            display = None
            driver = None
            try:
                display = Display(visible=0, size=(1920, 1080))
                display.start()
                options = uc.ChromeOptions()
                options.add_argument('--disable-setuid-sandbox')
                options.add_argument("--no-sandbox")
                options.add_argument("--disable-dev-shm-usage")
                options.add_argument("--disable-blink-features=AutomationControlled")
                # NOTE(review): the profile path is hard-coded; self.user_data_dir
                # and self.profile_directory are currently not used here.
                options.add_argument(f"--user-data-dir=/home/ling/.config/google-chrome/Profile2")

                driver = uc.Chrome(options=options, version_main=132)
                self.driver = driver
                driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
                self._display = display
                return driver

            except Exception as e:
                print(f'Error: {e}')
                print(f"WebDriver 啟動失敗，第 {attempt + 1} 次嘗試...")
                # Release what this attempt created so retries do not pile up
                # browser and display processes. self.driver is left pointing
                # at the (now closed) driver, as before, so callers that clean
                # up through self.driver keep working.
                if driver is not None:
                    try:
                        driver.quit()
                    except Exception as quit_error:
                        print(f'Error: {quit_error}')
                if display is not None:
                    try:
                        display.stop()
                    except Exception as stop_error:
                        print(f'Error: {stop_error}')
                if attempt == max_attempts - 1:
                    raise  # bare raise keeps the original traceback
                time.sleep(2)  # wait 2 seconds before retrying

    # AI Q&A - get the OpenAI response
    def ttm(self, article, language='繁體中文'):
        """Ask the OpenAI chat model to rewrite *article* as Markdown.

        Args:
            article: Raw text to be turned into a Markdown article.
            language: Output language named in the system prompt.

        Returns:
            The model's reply as a plain string.
        """
        # Same three-line system prompt, assembled from separate pieces.
        system_prompt = (
            "你是一個專門做 md 格式文章的AI助理.\n"
            "可以從一大串文字整理出高質量的 md 格式文章. 多餘的內容拿掉，只需要回傳文章內容即可.\n"
            f"輸出的語言為{language}."
        )
        messages = [
            ("system", system_prompt),
            ("human", "{question}"),
        ]

        # The callback collects token usage and cost for the call below.
        with get_openai_callback() as usage:
            chat_model = ChatOpenAI(
                model_name="gpt-4o",
                temperature=0.7,
                api_key=self.OPEN_API_KEY,
                max_tokens=4096,
            )
            prompt_template = ChatPromptTemplate.from_messages(messages)
            chain = prompt_template | chat_model | StrOutputParser()
            payload = {"question": article+"請轉成 md 格式"}
            markdown_text = chain.invoke(payload)

        print(f"Total Tokens: {usage.total_tokens}")
        print(f"Prompt Tokens: {usage.prompt_tokens}")
        print(f"Completion Tokens: {usage.completion_tokens}")
        print(f"Total Cost (USD): ${usage.total_cost}")
        return markdown_text

    def generate_article(self, urls):
        """Generate a Markdown article from *urls* via NotebookLM and save it.

        Opens NotebookLM, creates a notebook, adds every URL as a website
        source, submits the article prompt (which embeds ``self.keyword``),
        passes the answer text through ``ttm`` and writes the result with
        ``save_article_as_md``. The whole flow is tried up to three times and
        the browser is closed after every attempt, successful or not.

        Args:
            urls: Iterable of source URLs to feed into the notebook.

        Returns:
            ``'article.md 生成成功'`` when the file was written,
            ``'存檔失敗'`` when writing the file failed, or ``False`` when all
            three attempts raised.
        """
        for attempt in range(3):
            try:
                driver = self.get_webdriver()
                driver.get("https://notebooklm.google.com/")
                time.sleep(3)
                print('notebook這裡', driver.current_url)

                self._create_notebook(driver)

                # Enter every URL as a source.
                for index, source_url in enumerate(urls):
                    time.sleep(5)
                    if index == 0:
                        self._add_first_source(driver, source_url)
                    else:
                        self._add_extra_source(driver, source_url)
                    print(f'成功輸入網址: {source_url}')
                    time.sleep(7)

                time.sleep(5)

                answer = self._ask_for_article(driver)
                text = self.ttm(answer)
                if self.save_article_as_md(text):
                    return 'article.md 生成成功'
                return '存檔失敗'
            except Exception as e:
                print("出現錯誤: ", str(e))
            finally:
                # Runs on success, failure and early return alike, so no
                # browser is left behind.
                self._close_driver()

        return False

    def _close_driver(self):
        """Quit the current browser session, if any, and clear ``self.driver``."""
        driver, self.driver = self.driver, None
        if driver is None:
            return
        try:
            driver.quit()
        except Exception as e:
            print(f'Error: {e}')

    @staticmethod
    def _wait_until_ready(driver, locate, timeout=20):
        """Poll ``locate(driver)`` until it yields a displayed, enabled element.

        ``locate`` may return a falsy value or raise NoSuchElementException
        while the element is not there yet. Raises selenium's
        TimeoutException once *timeout* seconds have passed.
        """
        def ready(current_driver):
            element = locate(current_driver)
            if not element:
                return False
            if element.is_displayed() and element.is_enabled():
                return element
            return False

        return WebDriverWait(driver, timeout).until(ready)

    def _create_notebook(self, driver):
        """Click the "create new" button on the NotebookLM welcome page."""
        # find_element raises instead of returning a falsy value, so waiting
        # has to be done with an explicit wait rather than a "while not" loop.
        new_created = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located(
                (By.XPATH, '/html/body/labs-tailwind-root/div/welcome-page/div/div[2]/div[1]/div/button/span[2]')
            )
        )
        new_created.click()
        time.sleep(3)

    def _add_first_source(self, driver, source_url):
        """Add the first URL through the source dialog of a new notebook."""
        # Find the "website" chip.
        text_click = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located(
                (By.XPATH, '//*[@id="mat-mdc-chip-2"]/span[2]/span/span[2]')
            )
        )
        print('這裡', text_click)
        time.sleep(1)
        driver.execute_script("arguments[0].scrollIntoView(true);", text_click)
        driver.execute_script("arguments[0].click();", text_click)
        time.sleep(1)

        # Find the input field and type the URL.
        website_input = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.ID, 'mat-input-0'))
        )
        website_input.send_keys(source_url)
        website_input.send_keys(Keys.RETURN)

    def _add_extra_source(self, driver, source_url):
        """Add a further URL to a notebook that already has a source."""
        def third_button_label(current_driver):
            # The "add source" button is taken to be the third button label.
            labels = current_driver.find_elements(By.CLASS_NAME, "mdc-button__label")
            return labels[2] if len(labels) > 2 else False

        try:
            add_source = WebDriverWait(driver, 20).until(third_button_label)
            print(add_source.is_displayed())
            add_source.click()
        except Exception as e:
            # Best effort: the source dialog may already be open.
            print(f'Error: {e}')
        time.sleep(3)

        def locate_website_chip():
            # Get the whole chip-group list; the second group holds the chip.
            groups = driver.find_element(By.CLASS_NAME, 'chip-groups').find_elements(
                By.CSS_SELECTOR, '.chip-group.ng-star-inserted'
            )
            print(groups)
            return groups[1].find_element(By.TAG_NAME, 'mat-chip')

        # Find the "website" chip and click it.
        try:
            locate_website_chip().click()
        except Exception as e:
            print(f'Error: {e}')
            # Fall back to a forced JavaScript click. The chip is located
            # again from scratch so the fallback never depends on variables
            # the failed attempt may not have assigned.
            try:
                driver.execute_script("arguments[0].click();", locate_website_chip())
                print("使用 JavaScript 成功點擊元素")
            except Exception as js_error:
                print(f'JavaScript 點擊錯誤: {js_error}')
        time.sleep(2)

        # Find the input field inside the website-upload component and make
        # sure it can be operated before typing.
        website_input = self._wait_until_ready(
            driver,
            lambda d: d.find_element(By.TAG_NAME, 'website-upload')
            .find_element(By.TAG_NAME, 'mat-form-field')
            .find_element(By.TAG_NAME, 'input'),
        )
        try:
            website_input.send_keys(source_url)
            website_input.send_keys(Keys.RETURN)
        except Exception as e:
            print(f'Error: {e}')
            # Force the value in through JavaScript and fire an input event.
            driver.execute_script("arguments[0].value = arguments[1];", website_input, source_url)
            driver.execute_script("arguments[0].dispatchEvent(new Event('input'));", website_input)

    def _ask_for_article(self, driver):
        """Submit the article prompt in the chat panel and return the answer text."""
        prompt_input = self._wait_until_ready(
            driver,
            lambda d: d.find_element(By.TAG_NAME, 'chat-panel')
            .find_element(By.TAG_NAME, 'omnibar')
            .find_element(By.TAG_NAME, 'query-box')
            .find_element(By.TAG_NAME, 'textarea'),
        )

        complete_prompt = f"1. 彙整並生成一篇以常見問題為主的 MarkDown 格式的 專業文章(給客戶看的，最後一句「希望這篇文章能解答...」整句直接刪掉)。2. 文章必須出現多次這組字:{self.keyword}。3. 在各個分類或重要段落中加入一些具體的案例或數據。4. 必須使用段落格式，不要條列式。5. 若內容不是中文，要精準翻譯成中文。6. 生成 MarkDown 的 md 檔(須包含大標題及各個段落的標題)。"
        try:
            prompt_input.send_keys(complete_prompt)
            prompt_input.send_keys(Keys.RETURN)
        except Exception as e:
            print(f'Error: {e}')
            # Force the value in through JavaScript and fire an input event.
            driver.execute_script("arguments[0].value = arguments[1];", prompt_input, complete_prompt)
            driver.execute_script("arguments[0].dispatchEvent(new Event('input'));", prompt_input)

        print('等候答案')
        time.sleep(40)  # wait for the answer to be generated

        answer_index = 1  # the second chat-message is read as the answer

        def locate_answer(current_driver):
            messages = current_driver.find_element(By.TAG_NAME, 'chat-panel').find_elements(
                By.TAG_NAME, 'chat-message'
            )
            return messages[answer_index] if len(messages) > answer_index else False

        message = self._wait_until_ready(driver, locate_answer)
        return message.text
    
    def save_article_as_md(self, content, filename="./article.md"):
        # 打開或創建一個 .md 文件
        try:
            with open(filename, 'w+', encoding='utf-8-sig') as file:
                # 將文章內容寫入文件
                file.write(content)
                print(f"文章已成功保存為 {filename}")
            return True
        except Exception as e:
            print(f"保存文章時發生錯誤: {str(e)}")
            return False

# Manual test area (runs only when this module is executed directly).
if __name__ == "__main__":
    user_data_dir = 'C:/Users/wangy/AppData/Local/Google/Chrome/User Data'
    profile_directory = 'Profile 20'
    # ArticleGenerator requires the phrase the article must contain as its
    # third argument; the sample URLs below are all about virtual humans.
    keyword = '虛擬人'

    # Example usage
    article_generator = ArticleGenerator(user_data_dir, profile_directory, keyword)
    urls = [
        "https://zh.wikipedia.org/zh-tw/%E8%99%9B%E6%93%AC%E4%BA%BA",
        "https://www.naipo.com/Portals/1/web_tw/Knowledge_Center/Industry_Insight/IPNC_240515_1501.htm",
        "https://money.udn.com/money/story/11162/8333646",
        "https://gitmind.com/tw/digital-human-creator.html",
        "https://support.microsoft.com/zh-tw/office/%E5%9C%A8-microsoft-teams-%E4%B8%AD%E4%BB%A5%E8%99%9B%E6%93%AC%E4%BA%BA%E5%81%B6%E7%9A%84%E5%BD%A2%E5%BC%8F%E5%8A%A0%E5%85%A5%E6%9C%83%E8%AD%B0-5384e7b7-30c7-4bcb-8065-0c9e830cc8ad",
        "https://digitaldomain.com/%E8%99%9B%E6%93%AC%E4%BA%BA%E5%AF%A6%E9%A9%97%E5%AE%A4/?lang=zh-hant",
        "https://www.cdri.org.tw/xcdoc/cont?xsmsid=0H270572678476094046&sid=0N149542836021459905",
        "https://learn.microsoft.com/zh-tw/azure/ai-services/speech-service/text-to-speech-avatar/what-is-text-to-speech-avatar",
        "https://www.xfyun.cn/services/VirtualHumans",
        "https://www.bnext.com.tw/article/65449/virtual-human-subculture",
    ]

    is_succesed = article_generator.generate_article(urls)
    if is_succesed:
        print(is_succesed)