Browse Source

更新FastAPI

Your Name 7 months ago
parent
commit
f07825e860
4 changed files with 315 additions and 53 deletions
  1. 129 0
      create_profile.py
  2. 55 30
      resources_notebook.py
  3. 108 0
      test.py
  4. 23 23
      urlToarticle.py

+ 129 - 0
create_profile.py

@@ -0,0 +1,129 @@
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+import os
+import time
+import undetected_chromedriver as uc
+from pyvirtualdisplay import Display
+
+
+def _wait_for_next_button(driver, timeout=5):
+    """
+    Return the clickable "Next" button of the Google sign-in page.
+
+    Three locators are tried in order, each with its own wait of ``timeout``
+    seconds. A failure of one of the first two falls through to the next
+    locator; a failure of the last one propagates to the caller.
+
+    Args:
+        driver: WebDriver currently showing the sign-in page.
+        timeout: Seconds to wait for each locator.
+
+    Returns:
+        The first element that becomes clickable.
+    """
+    locators = (
+        # Method 1: the jsname attribute
+        (By.CSS_SELECTOR, "button[jsname='LgbsSe']"),
+        # Method 2: the full class combination
+        (By.CSS_SELECTOR, "button.VfPpkd-LgbsSe.VfPpkd-LgbsSe-OWXEXe-k8QpJ.VfPpkd-LgbsSe-OWXEXe-dgl2Hf"),
+        # Method 3: the parent button of the span containing the label text
+        (By.XPATH, "//button[.//span[contains(text(), '下一步')]]"),
+    )
+    *fallbacks, last_resort = locators
+    for locator in fallbacks:
+        try:
+            return WebDriverWait(driver, timeout).until(
+                EC.element_to_be_clickable(locator)
+            )
+        except Exception:
+            # This locator did not match in time; try the next one.
+            continue
+    return WebDriverWait(driver, timeout).until(
+        EC.element_to_be_clickable(last_resort)
+    )
+
+
+def create_chrome_profile(profile_dir, username, password):
+    """
+    Create a Chrome profile by signing in to Google inside a virtual display.
+
+    Starts a pyvirtualdisplay ``Display``, launches Chrome through
+    undetected-chromedriver with ``profile_dir`` as its user data directory,
+    submits the account name and password on accounts.google.com, and then
+    opens NotebookLM. Errors raised during the sign-in steps are printed and
+    not re-raised; errors raised while starting Chrome still propagate. The
+    browser and the virtual display are always shut down before returning.
+
+    Args:
+        profile_dir: Directory in which the profile is stored.
+        username: Google account name.
+        password: Google account password.
+    """
+    display = Display(visible=0, size=(1920, 1080))
+    display.start()
+    driver = None
+    try:
+        # Configure Chrome
+        options = uc.ChromeOptions()
+        options.add_argument("--no-sandbox")
+        options.add_argument("--disable-dev-shm-usage")
+        options.add_argument("--disable-blink-features=AutomationControlled")
+        options.add_argument(f"--user-data-dir={profile_dir}")
+        driver = uc.Chrome(options=options,driver_executable_path="/usr/local/bin/chromedriver")
+        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
+        print('開啟')
+        try:
+            # Open the Google sign-in page
+            driver.get('https://accounts.google.com')
+            print(driver.current_url)
+            # Wait for the e-mail field and fill it in
+            email_input = WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located((By.NAME, "identifier"))
+            )
+            email_input.send_keys(username)
+
+            next_button = _wait_for_next_button(driver)
+            print("找到下一步按鈕,準備點擊")
+            next_button.click()
+
+            # Give the next page time to load
+            time.sleep(2)
+
+            # Wait for the password field and fill it in
+            password_input = WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located((By.NAME, "Passwd"))
+            )
+            password_input.send_keys(password)
+
+            next_button = _wait_for_next_button(driver)
+            next_button.click()
+            print('輸入密碼成功')
+
+            # Wait for the sign-in to complete
+            time.sleep(3)
+
+            # Visit Google services so the profile is actually written
+            services = ["https://notebooklm.google/"]
+
+            for service in services:
+                driver.get(service)
+                time.sleep(2)
+                print('成功登入notebook',driver.current_url)
+                time.sleep(10)
+            print(f"Profile successfully created at: {profile_dir}")
+
+        except Exception as e:
+            print(f"Error creating profile: {str(e)}")
+
+    finally:
+        # Always release the browser and the virtual display, including when
+        # Chrome start-up itself failed part-way.
+        try:
+            if driver is not None:
+                driver.quit()
+        finally:
+            display.stop()
+
+if __name__ == "__main__":
+    # Example usage; the account name and password are left empty here.
+    create_chrome_profile(
+        profile_dir="/home/ling/.config/google-chrome/Profile2",
+        username="",
+        password="",
+    )

+ 55 - 30
resources_notebook.py

@@ -3,19 +3,23 @@ import time
 import os
 import urllib
 from selenium.webdriver.common.by import By
+from pyvirtualdisplay import Display
 import sys
 import urlToarticle
-from fastapi import FastAPI
+from fastapi import FastAPI, Form, Request
+from fastapi.responses import FileResponse, HTMLResponse
+from fastapi.templating import Jinja2Templates
 from pydantic import BaseModel
 from typing import List
-
+import undetected_chromedriver as uc
 
 app = FastAPI()
+templates = Jinja2Templates(directory="templates")
 driver = None
 
-driver_path = 'C:\/Users\/s1301\/Downloads\/132\/chromedriver-win32\/chromedriver.exe'
-user_data_dir = 'C:/Users/s1301/AppData/Local/Google/Chrome/User Data'
-profile_directory = 'Profile 1'
+driver_path = '/usr/local/bin/chromedriver'
+user_data_dir = '/home/ling/.config/google-chrome'
+profile_directory = 'Default'
 
 
 def re_get_webdriver():
@@ -30,29 +34,37 @@ def re_get_webdriver():
         print('quit....')
         driver = None
     try:
+        display = Display(visible=0, size=(1920, 1080))
+        display.start()
         options = uc.ChromeOptions()
-        # options.add_argument("--user-agent=" + "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19")
-
-        options.add_argument("--window-size=200,100")  # 縮小視窗
-        options.add_argument("--window-position=-32000,-32000")  # 移到螢幕外
-        # for window in gw.getWindowsWithTitle("Chrome"):
-        #     window.minimize()
-        driver = uc.Chrome(options=options)
-        driver.delete_all_cookies()
-    except:
+        #options.add_argument("--window-size=200,100")  # 縮小視窗
+        #options.add_argument("--window-position=-32000,-32000")  # 移到螢幕外
+        options.add_argument("--no-sandbox")
+        options.add_argument("--disable-dev-shm-usage")
+        options.add_argument("--disable-blink-features=AutomationControlled")
+
+        driver = uc.Chrome(options=options,version_main=132)
+        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
+
+    except Exception as e:
+        print(f"Chrome 啟動失敗: {str(e)}")
         driver = None
+        
         return None
 
     return driver
 
 
 def get_resource(kw):
-    while True:
+    max_attempts = 2
+    attempts = 0
+    while attempts<max_attempts:
         driver = re_get_webdriver()
         print('re_get_webdriver')
         if driver is not None:
             break
         time.sleep(3)
+        attempts+=1
     try:
         googleurl = 'https://www.google.com/search?q={}&num={}&hl={}&gl=tw'.format(urllib.parse.quote(kw), 100, 'zh-TW')
         # googleurl = 'https://www.google.com/search?q={}&num={}&hl={}&gl=tw&tbm=vid&tbs=vd:m'.format(urllib.parse.quote(kw), 100, 'zh-TW')
@@ -83,12 +95,13 @@ def get_resource(kw):
 
     driver.quit()
 
-# urls = get_resource('書房 設計 北歐') # 取得搜尋結果第一頁網址來源
-# urls = ['https://www.100.com.tw/article/4359', 'https://yes-99.com/news-info.asp?id=278', 'https://www.seec.com.tw/content/Goods/List.aspx?SiteID=10&MmmID=655575436061073254&CatId=2015120918304527132']
-# article_generator = urlToarticle.ArticleGenerator(user_data_dir, profile_directory, driver_path)
-# is_succesed = article_generator.generate_article(urls)
-# if is_succesed:
-#     print(is_succesed)
+if __name__ == "__main__":
+    # Manual run: look up the source URLs for a sample keyword.
+    urls = get_resource('書房 設計 北歐') # Get the source URLs from the first page of search results
+#urls = ['https://www.100.com.tw/article/4359', 'https://yes-99.com/news-info.asp?id=278']
+#article_generator = urlToarticle.ArticleGenerator(user_data_dir, profile_directory)
+#is_succesed = article_generator.generate_article(urls)
+#if is_succesed:
+#    print(is_succesed)
 
 
 class SearchRequest(BaseModel):
@@ -96,17 +109,29 @@ class SearchRequest(BaseModel):
 
+# Pydantic model with a list of URL strings and a keyword string.
+# NOTE(review): the /generate handler added in this diff reads its keyword from a
+# form field rather than from this model; confirm whether the model is still used.
 class GenerateRequest(BaseModel):
     urls: List[str]
+    keyword: str
+
+# POST /generate: look up URLs for the submitted form keyword, run the article
+# generator on them and return ./article.md as a text/markdown FileResponse.
+# Responds with an HTML message and status 400 when the search yields no URLs,
+# 500 when generation reports failure, and 404 when ./article.md does not exist.
+# NOTE(review): get_resource() and ArticleGenerator.generate_article() are called
+# without await inside this async handler; if they block (they appear to drive a
+# browser), the event loop is blocked for the duration - confirm.
+@app.post("/generate")
+async def generate_article(keyword: str = Form(...)):
+    urls = get_resource(keyword)
+    # Make sure the search succeeded
+    if not urls:
+        return HTMLResponse(content="<h2>搜尋失敗,請回上一頁重試!</h2>", status_code=400)
+
+    article_generator = urlToarticle.ArticleGenerator(user_data_dir, profile_directory, keyword)
+    is_success = article_generator.generate_article(urls)
 
-@app.post("/search/")
-async def search_resource(request: SearchRequest):
-    urls = get_resource(request.keyword)
-    return {"urls":urls}
+    if not is_success:
+        return HTMLResponse(content="<h2>文章生成失敗!</h2>", status_code=500)
 
+    file_path = "./article.md"
+    if not os.path.exists(file_path):
+        return HTMLResponse(content="<h2>找不到生成的 Markdown 檔案!</h2>", status_code=404)
 
-@app.post("/generate/")
-async def generate_article(request: GenerateRequest):
-    article_generator = urlToarticle.ArticleGenerator(user_data_dir, profile_directory, driver_path)
-    is_success = article_generator.generate_article(request.urls)
-    return {"success": is_success}
+    # Return the Markdown file
+    return FileResponse(file_path, media_type="text/markdown", filename="article.md")
 
 
+# GET /search: render the search.html template, passing the incoming request
+# object in the template context.
+@app.get("/search", response_class=HTMLResponse)
+async def search_page(request: Request):
+    return templates.TemplateResponse("search.html", {"request": request})

+ 108 - 0
test.py

@@ -0,0 +1,108 @@
+import time
+import os
+import urllib
+from selenium.webdriver.common.by import By
+from pyvirtualdisplay import Display
+import sys
+import urlToarticle
+from fastapi import FastAPI, Form, Request
+from fastapi.responses import FileResponse, HTMLResponse
+from fastapi.templating import Jinja2Templates
+from pydantic import BaseModel
+from typing import List
+from pyvirtualdisplay import Display
+from selenium.webdriver.chrome.service import Service
+import undetected_chromedriver as uc
+
+app = FastAPI()
+templates = Jinja2Templates(directory="templates")
+driver = None
+
+driver_path = '/usr/local/bin/chromedriver'
+user_data_dir = '/home/ling/.config/google-chrome'
+profile_directory = 'Default'
+
+
+def re_get_webdriver():
+    """
+    Quit the current global driver, if any, and start a fresh Chrome session.
+
+    The virtual display is started once and reused by later calls, so repeated
+    calls do not start one new display each. If Chrome cannot be started or
+    prepared, the reason is printed, any half-started browser is quit, and the
+    module-level ``driver`` stays ``None``.
+
+    Returns:
+        The new driver (also stored in the module-level ``driver``), or
+        ``None`` when start-up failed.
+    """
+    global driver
+    if driver is not None:
+        print('closing....')
+        driver.quit()
+        print('quit....')
+        driver = None
+
+    new_driver = None
+    try:
+        # Start the virtual display only on the first successful call; it is
+        # remembered on the function object so no extra module state is needed.
+        if getattr(re_get_webdriver, '_display', None) is None:
+            display = Display(visible=0, size=(1920, 1080))
+            display.start()
+            re_get_webdriver._display = display
+
+        options = uc.ChromeOptions()
+        options.add_argument("--no-sandbox")
+        options.add_argument("--disable-dev-shm-usage")
+        options.add_argument("--disable-blink-features=AutomationControlled")
+
+        new_driver = uc.Chrome(options=options, version_main=132)
+
+        new_driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
+        time.sleep(3)
+        print(new_driver.current_url)
+    except Exception as e:
+        print(f"Chrome 啟動失敗: {str(e)}")
+        if new_driver is not None:
+            # Do not leave a half-initialised browser running.
+            new_driver.quit()
+        return None
+
+    driver = new_driver
+    return driver
+
+
+def get_resource(kw):
+    """
+    Search Google for ``kw`` and return the links of the first results.
+
+    A browser is obtained through ``re_get_webdriver`` (at most two attempts,
+    three seconds apart) and is always quit before this function returns or
+    exits.
+
+    Args:
+        kw: Search keyword; it is URL-quoted before being put in the query.
+
+    Returns:
+        A list with the ``href`` of up to the first 11 result links, or
+        ``None`` when no browser could be started or an error occurred.
+
+    Raises:
+        SystemExit: When the result page contains no matching links.
+    """
+    # Imported here because the file only does ``import urllib``, which does
+    # not by itself guarantee that the ``urllib.parse`` submodule is loaded.
+    from urllib.parse import quote
+
+    global driver
+    max_attempts = 2
+    for _ in range(max_attempts):
+        driver = re_get_webdriver()
+        print('re_get_webdriver')
+        if driver is not None:
+            break
+        time.sleep(3)
+
+    if driver is None:
+        # Every attempt failed; there is no browser to search with.
+        print('exception', 'webdriver could not be started')
+        return None
+
+    try:
+        # Video-only variant of the query: append '&tbm=vid&tbs=vd:m'.
+        googleurl = 'https://www.google.com/search?q={}&num={}&hl={}&gl=tw'.format(quote(kw), 100, 'zh-TW')
+        driver.get(googleurl)
+
+        time.sleep(6)
+        print(driver.current_url)
+        elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']//a")
+        numresults = len(elmts)
+        print('搜尋結果數量', numresults)
+        if numresults == 0:
+            print(driver.current_url)
+            print(driver.title)
+            sys.exit()
+
+        resources_list = [elmt.get_attribute('href') for elmt in elmts[0:11]]
+        print(resources_list)
+        return resources_list
+
+    except Exception as e:
+        print('exception', e)
+        return None
+
+    finally:
+        # The quit call used to sit after the return statements and never ran.
+        # Resetting the global keeps re_get_webdriver from quitting it again.
+        try:
+            driver.quit()
+        finally:
+            driver = None
+
+if __name__ == "__main__":
+    # Only run the sample search when executed as a script, so that importing
+    # this module (which also defines the FastAPI app) does not launch Chrome.
+    urls = get_resource('書房 設計 北歐') # Get the source URLs from the first page of search results
+#urls = ['https://www.100.com.tw/article/4359', 'https://yes-99.com/news-info.asp?id=278']
+#article_generator = urlToarticle.ArticleGenerator(user_data_dir, profile_directory)
+#is_succesed = article_generator.generate_article(urls)
+#if is_succesed:
+#    print(is_succesed)
+
+

+ 23 - 23
urlToarticle.py

@@ -12,6 +12,7 @@ import undetected_chromedriver as uc
 from dotenv import load_dotenv
 import os
 import logging
+from pyvirtualdisplay import Display
 
 # 加载环境变量
 load_dotenv()
@@ -19,27 +20,29 @@ load_dotenv()
 logging.basicConfig(level=logging.INFO)
 
 class ArticleGenerator:
-    def __init__(self, user_data_dir, profile_directory):
+    def __init__(self, user_data_dir, profile_directory, keyword):
+        # Store the Chrome profile settings and the keyword. The driver starts
+        # as None and is assigned in get_webdriver.
+        # NOTE(review): get_webdriver in this diff passes a hard-coded
+        # --user-data-dir and has the user_data_dir / profile_directory options
+        # commented out, so these two attributes look unused there - confirm.
         self.user_data_dir = user_data_dir
         self.profile_directory = profile_directory
         self.driver = None
+        # The keyword is interpolated into the article-generation prompt.
+        self.keyword = keyword
+        # Read from the OPEN_API_KEY environment variable; None when it is unset.
         self.OPEN_API_KEY: str = os.getenv('OPEN_API_KEY')
 
     def get_webdriver(self):
+        os.system("pkill -f chrome")
         for attempt in range(3):  # 嘗試最多 3 次
             try:
+                display = Display(visible=0, size=(1920, 1080))
+                display.start()
                 options = uc.ChromeOptions()
-                # options.add_argument("--disable-blink-features=AutomationControlled")
-                # options.add_experimental_option("excludeSwitches", ["enable-automation"])
-                # options.add_experimental_option("useAutomationExtension", False)
-                options.add_argument('--ignore-certificate-errors')
-                options.add_argument("--disable-gpu")
+                options.add_argument('--disable-setuid-sandbox')
+                options.add_argument("--no-sandbox")
                 options.add_argument("--disable-dev-shm-usage")
-                # options.add_argument("headless")
-                options.add_argument(f"user-data-dir={self.user_data_dir}")
-                options.add_argument(f'--profile-directory={self.profile_directory}')
-                # s = Service(self.driver_path)
-                self.driver=uc.Chrome(options=options, version_main=132, use_subprocess=True)
+                options.add_argument("--disable-blink-features=AutomationControlled")
+                #options.add_argument(f"--user-data-dir={self.user_data_dir}")  # Linux Profile
+                #options.add_argument(f'--profile-directory=Profile1')
+                options.add_argument(f"--user-data-dir=/home/ling/.config/google-chrome/Profile2")
+
+                self.driver=uc.Chrome(options=options,version_main=132)
                 self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
                 return self.driver
 
@@ -90,7 +93,7 @@ class ArticleGenerator:
                 url = "https://notebooklm.google.com/"
                 driver.get(url)
                 time.sleep(3)
-                
+                print('notebook這裡',driver.current_url)
                 # 新建專案 
                 new_created = driver.find_element(By.XPATH, '/html/body/labs-tailwind-root/div/welcome-page/div/div[2]/div[1]/div/button/span[2]')
                 while not new_created:
@@ -114,23 +117,19 @@ class ArticleGenerator:
                 for i, url in enumerate(urls):
                     time.sleep(5)
                     if i == 0:
-                        upload_urls = driver.find_element(By.CSS_SELECTOR, '.mat-icon.notranslate.upload-icon.google-symbols.mat-icon-no-color')
-                        driver.execute_script("arguments[0].scrollIntoView(true);", upload_urls)
-                        while not upload_urls:
-                            time.sleep(0.1)
-                            upload_urls = driver.find_element(By.CSS_SELECTOR, '.mat-icon.notranslate.upload-icon.google-symbols.mat-icon-no-color')
-                            driver.execute_script("arguments[0].scrollIntoView(true);", upload_urls)
-                        upload_urls.click()
-                        time.sleep(3)
                         # 找網站按鈕
                         text_click = driver.find_element(By.XPATH, '//*[@id="mat-mdc-chip-2"]/span[2]/span/span[2]')
+                        print('這裡',text_click)
+                        time.sleep(1)
                         driver.execute_script("arguments[0].scrollIntoView(true);", text_click)
                         while not text_click:
-                            time.sleep(0.1)
+                            time.sleep(1)
                             text_click = driver.find_element(By.XPATH, '//*[@id="mat-mdc-chip-2"]/span[2]/span/span[2]')
                             driver.execute_script("arguments[0].scrollIntoView(true);", text_click)
-                        text_click.click()
+                        driver.execute_script("arguments[0].click();", text_click)
+                        # text_click.click()
                         time.sleep(1)
+
                         # 找到 textarea 並輸入網址
                         website_input = driver.find_element(By.ID, 'mat-input-0')
                         while not text_click:
@@ -213,7 +212,8 @@ class ArticleGenerator:
                     #     break
                     # 輸入生成文章的 prompt
                     # complete_prompt = "1. 彙整並生成一篇以常見問題為主的 MarkDown 格式的 專業文章(給客戶看的,最後一句「希望這篇文章能解答...」整句直接刪掉)。2. 文章必須為 MarkDown 格式,也就是必須包含#、##等等。3. 在各個分類或重要段落中加入一些具體的案例或數據。4. 必須使用段落格式,不要條列式。5. 若內容不是中文,要精準翻譯成中文。6. 生成 MarkDown 的 md 檔(須包含大標題及各個段落的標題)。"
-                complete_prompt = "彙整並生成一篇以常見問題為主的 MarkDown 格式的 專業文章。在各個分類或重要段落中加入一些具體的案例或數據。5. 若內容不是中文,要精準翻譯成中文。"
+                #complete_prompt = "彙整並生成一篇以常見問題為主的 MarkDown 格式的 專業文章。在各個分類或重要段落中加入一些具體的案例或數據。5. 若內容不是中文,要精準翻譯成中文。"
+                complete_prompt = f"1. 彙整並生成一篇以常見問題為主的 MarkDown 格式的 專業文章(給客戶看的,最後一句「希望這篇文章能解答...」整句直接刪掉)。2. 文章必須出現多次這組字:{self.keyword}。3. 在各個分類或重要段落中加入一些具體的案例或數據。4. 必須使用段落格式,不要條列式。5. 若內容不是中文,要精準翻譯成中文。6. 生成 MarkDown 的 md 檔(須包含大標題及各個段落的標題)。"
                 try:
                     prompt_input.send_keys(complete_prompt)
                     prompt_input.send_keys(Keys.RETURN)