# gsearch_selenium.py
  1. from selenium import webdriver
  2. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  3. import time
  4. import os
  5. import datetime
  6. import urllib.parse
  7. from selenium.webdriver.support.ui import WebDriverWait
  8. from selenium.webdriver.common.by import By
  9. from selenium.webdriver.support import expected_conditions as EC
  10. import codecs
  11. import random
  12. from bs4 import BeautifulSoup
  13. import requests
  14. import time
  15. import rpyc
  16. import sys
  17. import docker
  18. import googlesearch
  19. import codecs
  20. import sys
  21. import time
  22. import dataset
  23. import os
  24. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/hhh?charset=utf8mb4')
  25. #cursor=db.query('SELECT kw FROM hhh.hhh_contentgap_serp where ranking is not null;')
  26. #cursor=db.query('SELECT kw FROM hhh.hhh_contentgap_serp where kw not in (select distinct kw from hhh_contentgap_serp where id >= 155)')
  27. kwlst={}
  28. #for c in cursor:
  29. # kwlst[c['kw']]=1
  30. table=db['hhh_contentgap_serp']
  31. curdir=os.path.realpath('.')
  32. #fr=codecs.open(curdir+os.sep+'contentgap.txt','r','utf-8')
  33. #fr=codecs.open(curdir+os.sep+'hhh\\seo\\contentgap.txt','r','utf-8')
  34. #fr=codecs.open('C:\\gitlab\\kw_tools\\kw_tools\\hhh\\SEO\\contentgap.txt','r','utf-8')
  35. #lines=fr.readlines()
  36. lst=[]
  37. #for l in lines:
  38. # lst.append(l.replace('\n',''))
  39. #
  40. cursor=db.query('select term from hhh.contentgap_terms where term not in (SELECT kw FROM hhh.hhh_contentgap_serp where datediff(now(),dt) =0 and ranking is not null )')
  41. for c in cursor:
  42. lst.append(c['term'])
  43. headers = {
  44. "Authorization": "Bearer " + "t35vhZtWNgvDNWHc3DJh0OKll3mcB9GvC8K2EAkBug2",
  45. "Content-Type": "application/x-www-form-urlencoded"
  46. }
  47. def send_msg(kw):
  48. params = {"message": "處理關鍵字: "+kw}
  49. r = requests.post("https://notify-api.line.me/api/notify",headers=headers, params=params)
  50. def empty_query(q):
  51. global driver
  52. googleurl='https://www.google.com/search?q='+urllib.parse.quote(q)
  53. driver.get(googleurl)
  54. time.sleep(3)
  55. def process_query(qs,number_results=10,language_code='en',pat='hhh.com.tw'):
  56. global driver
  57. escaped_search_term=urllib.parse.quote(qs)
  58. # escaped_search_term = qs.replace(' ', '+')
  59. # googleurl='https://www.google.com/search?q='+
  60. googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(escaped_search_term, number_results+1,language_code)
  61. driver.get(googleurl)
  62. elmts=driver.find_elements_by_xpath("//div[@class='yuRUbf']/a")
  63. idx=0
  64. for elmt in elmts:
  65. try:
  66. href=elmt.get_attribute('href')
  67. print(str(idx)+': '+href)
  68. if pat in href:
  69. return idx
  70. idx+=1
  71. except:
  72. print('href exception')
  73. try:
  74. elmt=driver.find_element_by_xpath("//a[@id='pnnext']")
  75. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  76. webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  77. except:
  78. print('pnnext exception')
  79. return None
  80. time.sleep(4)
  81. elmts=driver.find_elements_by_xpath("//div[@class='yuRUbf']/a")
  82. for elmt in elmts:
  83. try:
  84. href=elmt.get_attribute('href')
  85. print(str(idx)+': '+href)
  86. if pat in href:
  87. return idx
  88. idx+=1
  89. except:
  90. print('href2 exception')
  91. result=[]
  92. driver=None
  93. def restart_browser():
  94. os.system('docker container restart p4444')
  95. # client = docker.from_env()
  96. # ls=client.containers.list()
  97. # print(ls)
  98. # ls[0].restart()
  99. time.sleep(10)
  100. options = webdriver.ChromeOptions()
  101. # options.add_argument("--proxy-server=socks5://130.61.93.198:1080")
  102. #driver=webdriver.Chrome(desired_capabilities=options.to_capabilities())
  103. driver = webdriver.Remote(
  104. command_executor='http://127.0.0.1:4444/wd/hub',
  105. # command_executor='http://172.104.93.163:4444/wd/hub',
  106. #command_executor='http://dev2.choozmo.com:14444/wd/hub',
  107. desired_capabilities=options.to_capabilities())
  108. # desired_capabilities=DesiredCapabilities.CHROME)
  109. driver.set_window_size(1400,1000)
  110. return driver
  111. for l in lst:
  112. #for l in lst[21:]:
  113. #for l in lst[32:]:
  114. #for l in lst[42:]:
  115. if True:
  116. # if kwlst.get(l) is None:
  117. driver=restart_browser()
  118. # l='房間 油漆'
  119. # idx=process_query(,number_results=100,language_code='zh-TW',pat='hhh.com.tw')
  120. idx=process_query(l,number_results=100,language_code='zh-TW',pat='hhh.com.tw')
  121. # if idx is None:
  122. # sys.exit()
  123. print({'kw':l,'ranking':idx})
  124. if idx==None:
  125. print(driver.page_source)
  126. if '我們的系統偵測到您的電腦網路送出的流量有異常情況' in driver.page_source:
  127. print('baned.....')
  128. sys.exit()
  129. table.insert({'kw':l,'ranking':idx,'dt':datetime.datetime.now()})
  130. db.commit()
  131. # time.sleep(9999)
  132. # time.sleep(4)