# resources_notebook.py (4.8 KB)
  1. import undetected_chromedriver as uc
  2. import time
  3. import os
  4. import urllib
  5. from selenium.webdriver.common.by import By
  6. from pyvirtualdisplay import Display
  7. import sys
  8. import urlToarticle
  9. from fastapi import FastAPI, Form, Request
  10. from fastapi.responses import FileResponse, HTMLResponse
  11. from fastapi.templating import Jinja2Templates
  12. from pydantic import BaseModel
  13. from typing import List
  14. import undetected_chromedriver as uc
  15. app = FastAPI()
  16. templates = Jinja2Templates(directory="templates")
  17. driver = None
  18. driver_path = '/usr/local/bin/chromedriver'
  19. user_data_dir = '/home/ling/.config/google-chrome'
  20. profile_directory = 'Default'
  21. def re_get_webdriver():
  22. global port
  23. global driver
  24. global portnum
  25. global is_docker
  26. result = []
  27. if driver is not None:
  28. print('closing....')
  29. driver.quit()
  30. print('quit....')
  31. driver = None
  32. try:
  33. display = Display(visible=0, size=(1920, 1080))
  34. display.start()
  35. options = uc.ChromeOptions()
  36. #options.add_argument("--window-size=200,100") # 縮小視窗
  37. #options.add_argument("--window-position=-32000,-32000") # 移到螢幕外
  38. options.add_argument("--no-sandbox")
  39. options.add_argument("--disable-dev-shm-usage")
  40. options.add_argument("--disable-blink-features=AutomationControlled")
  41. driver = uc.Chrome(options=options,version_main=132)
  42. driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
  43. except Exception as e:
  44. print(f"Chrome 啟動失敗: {str(e)}")
  45. driver = None
  46. return None
  47. return driver
  48. def get_resource(kw):
  49. max_attempts = 2
  50. attempts = 0
  51. while attempts<max_attempts:
  52. driver = re_get_webdriver()
  53. print('re_get_webdriver')
  54. if driver is not None:
  55. break
  56. time.sleep(3)
  57. attempts+=1
  58. try:
  59. googleurl = 'https://www.google.com/search?q={}&num={}&hl={}&gl=tw'.format(urllib.parse.quote(kw), 100, 'zh-TW')
  60. # googleurl = 'https://www.google.com/search?q={}&num={}&hl={}&gl=tw&tbm=vid&tbs=vd:m'.format(urllib.parse.quote(kw), 100, 'zh-TW')
  61. # googleurl = f'https://www.google.co.jp/search?q={kw}&sca_esv=741dc4f98c90c9c4&source=hp&ei=djmOZ8inMYWk2roPk_yMiA4&iflsig=AL9hbdgAAAAAZ45HhiuBAUgi3Vf3Qd5FTyfcyUOySOxk&ved=0ahUKEwjIutTinoSLAxUFklYBHRM-A-EQ4dUDCA8&uact=5&oq=junho&gs_lp=Egdnd3Mtd2l6IgphbmdlbG8ga29vMgUQLhiABDIEEAAYHjIEEAAYHjIEEAAYHjIEEAAYHjIEEAAYHjIEEAAYHjIEEAAYHjIGEAAYChgeSL0YUABYqRZwAXgAkAEAmAGwAaABjQyqAQQwLjExuAEDyAEA-AEBmAIMoALYDMICCxAuGIAEGNEDGMcBwgIFEAAYgATCAgoQLhiABBhDGIoFwgILEC4YgAQYxwEYrwHCAgcQABiABBgKwgIHEC4YgAQYCsICDRAuGIAEGMcBGAoYrwGYAwCSBwQxLjExoAfBqQE&sclient=gws-wiz'
  62. driver.get(googleurl)
  63. time.sleep(6)
  64. print(driver.current_url)
  65. elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']//a")
  66. numresults = len(elmts)
  67. print('搜尋結果數量', numresults)
  68. if numresults == 0:
  69. print(driver.current_url)
  70. print(driver.title)
  71. sys.exit()
  72. resources_list = []
  73. for elmt in elmts[0:11]:
  74. href = elmt.get_attribute('href')
  75. resources_list.append(href)
  76. print(resources_list)
  77. return resources_list
  78. except Exception as e:
  79. print('exception')
  80. return None
  81. driver.quit()
  82. if __name__ == "__main__":
  83. urls = get_resource('書房 設計 北歐') # 取得搜尋結果第一頁網址來源
  84. #urls = ['https://www.100.com.tw/article/4359', 'https://yes-99.com/news-info.asp?id=278']
  85. #article_generator = urlToarticle.ArticleGenerator(user_data_dir, profile_directory)
  86. #is_succesed = article_generator.generate_article(urls)
  87. #if is_succesed:
  88. # print(is_succesed)
  89. class SearchRequest(BaseModel):
  90. keyword: str
  91. class GenerateRequest(BaseModel):
  92. urls: List[str]
  93. keyword: str
  94. @app.post("/generate")
  95. async def generate_article(keyword: str = Form(...)):
  96. urls = get_resource(keyword)
  97. # 確保搜尋成功
  98. if not urls:
  99. return HTMLResponse(content="<h2>搜尋失敗,請回上一頁重試!</h2>", status_code=400)
  100. article_generator = urlToarticle.ArticleGenerator(user_data_dir, profile_directory, keyword)
  101. is_success = article_generator.generate_article(urls)
  102. if not is_success:
  103. return HTMLResponse(content="<h2>文章生成失敗!</h2>", status_code=500)
  104. file_path = "./article.md"
  105. if not os.path.exists(file_path):
  106. return HTMLResponse(content="<h2>找不到生成的 Markdown 檔案!</h2>", status_code=404)
  107. # 回傳 Markdown 檔案
  108. return FileResponse(file_path, media_type="text/markdown", filename="article.md")
  109. @app.get("/search", response_class=HTMLResponse)
  110. async def search_page(request: Request):
  111. return templates.TemplateResponse("search.html", {"request": request})