fetch__url_content.py

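# Fetch the text content of search-result URLs that have not been stored yet:
# each URL from kw_url_search_result is loaded in Chrome through a local SOCKS5
# proxy, converted to plain text with html2text, trimmed by clean_txt(), and
# inserted into the url_content table. When Cloudflare blocks a request, the
# proxy container and the browser are restarted before continuing.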
import traceback
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import os
import datetime
import urllib.parse
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import codecs
import random
from bs4 import BeautifulSoup
import requests
# import rpyc
import sys
import docker
# import googlesearch
import dataset
import html2text
from userAgentRandomizer import userAgents
from fp.fp import FreeProxy

# Two connections to the same database: db feeds the read cursor, db2 handles inserts.
db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
db2 = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')

# Search-result URLs whose content has not been stored yet, in random order.
cursor = db.query('select url from kw_url_search_result where url not in (select url from url_content) order by rand()')

def restart_browser():
    # Restart the 'proxy1' container to rotate the outgoing IP (9050 is Tor's
    # default SOCKS port), then open a fresh incognito Chrome with a random user agent.
    os.system('docker container restart proxy1')
    ua = userAgents()
    user_agent = ua.random()
    time.sleep(8)

    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")
    # proxy = FreeProxy().get()
    # print(proxy)
    # sys.exit()
    options.add_argument('--proxy-server=socks5://127.0.0.1:9050')
    options.add_argument("--user-agent=" + user_agent)
    options.add_argument("--incognito")

    driver = webdriver.Chrome(options=options)
    driver.set_window_size(1400, 1000)
    driver.delete_all_cookies()
    return driver

def clean_txt(txt):
    # Trim html2text output from a forum thread down to the post body: start
    # collecting after the "訂閱文章" (subscribe) toolbar line, skip reply-button
    # noise, and stop at the "連結/回報/列印" (link/report/print) footer.
    fulltxt = ""
    lines = txt.split("\n")
    beginning = False
    for l in lines:
        ltxt = l.strip()
        if ltxt == '* __':       # bare bullet icon left over by html2text
            continue
        if ltxt == '我要回覆':    # "reply" button text
            continue
        if beginning:
            fulltxt += l + "\n"
        else:
            if ' * __ 訂閱文章' in l:
                beginning = True
        if '__ 連結 __ 回報 __ 只看樓主 __ 列印' in l:
            break
        if '__ 連結 __ 回報 __ 只看此人 __ 列印' in l:
            break
    print(fulltxt)
    return fulltxt

driver = restart_browser()
table = db2['url_content']

for c in cursor:
    url = c['url']
    print(url)
    # driver.get('https://whatismyipaddress.com/')
    # time.sleep(9999)
    driver.get(url)
    time.sleep(5)

    # A Cloudflare "Please Wait" challenge means this exit IP is blocked:
    # rotate the proxy and browser, then move on to the next URL.
    if 'Please Wait' in driver.title and 'Cloudflare' in driver.title:
        driver = restart_browser()
        continue

    # Render the page to plain text, keep only the post body, and store it.
    src = driver.page_source
    h = html2text.HTML2Text()
    h.ignore_links = True
    txt = h.handle(src)
    resulttxt = clean_txt(txt)
    table.insert({'content': resulttxt, 'url': url})
    time.sleep(5)
    # print()
    # break

# print(html2text.html2text("<p><strong>Zed's</strong> dead baby, <em>Zed's</em> dead.</p>"))
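
# Runtime assumptions (taken from the code above, not enforced anywhere): a Docker
# container named 'proxy1' exposing a SOCKS5 proxy on 127.0.0.1:9050 (e.g. Tor),
# a chromedriver matching the local Chrome available on PATH, and access to the MySQL
# host db.ptt.cx with the kw_url_search_result and url_content tables in the seo schema.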