click_hhh.py 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184
  1. #import redis
  2. import time
  3. import traceback
  4. #import json
  5. from selenium import webdriver
  6. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  7. import time
  8. import urllib
  9. import os
  10. from selenium.webdriver.support.ui import WebDriverWait
  11. from selenium.webdriver.common.by import By
  12. from selenium.webdriver.support import expected_conditions as EC
  13. import dataset
  14. from selenium.webdriver.common.keys import Keys
  15. import json
  16. import random
  17. import time
  18. import redis
  19. import sys
  20. import codecs
  21. import random
  22. import os
  23. import time
  24. import requests
  25. driver=None
  26. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
  27. def re_get_webdriver():
  28. global port
  29. global driver
  30. global portnum
  31. result=[]
  32. if driver is not None:
  33. print('closing....')
  34. driver.quit()
  35. print('quit....')
  36. driver=None
  37. try:
  38. options = webdriver.ChromeOptions()
  39. options.add_argument("--no-sandbox")
  40. options.add_argument("--headless")
  41. options.add_argument("--incognito")
  42. try:
  43. driver = webdriver.Remote(
  44. command_executor='http://127.0.0.1:'+str(portnum)+'/wd/hub',
  45. options=options)
  46. except:
  47. traceback.print_exc()
  48. return None
  49. return driver
  50. except:
  51. traceback.print_exc()
  52. driver=None
  53. return None
  54. return driver
  55. def run_once(jsobj):
  56. table=db['rank_detection']
  57. print(jsobj)
  58. kw=jsobj['kw']
  59. i=100
  60. while True:
  61. driver=re_get_webdriver()
  62. if driver is not None:
  63. break
  64. time.sleep(3)
  65. try:
  66. kw=jsobj['kw']
  67. if jsobj.get('domain') is None:
  68. exclude=jsobj['exclude']
  69. domain=None
  70. else:
  71. domain=jsobj['domain']
  72. exclude=None
  73. driver.get('https://www.google.com?num=100')
  74. time.sleep(17)
  75. while True:
  76. try:
  77. print(driver.current_url)
  78. break
  79. except:
  80. traceback.print_exc()
  81. driver=re_get_webdriver()
  82. time.sleep(3)
  83. driver.get('https://www.google.com?num=100')
  84. time.sleep(3)
  85. time.sleep(3)
  86. elmt = driver.find_element(By.XPATH, "//input[@name='q']")
  87. time.sleep(1)
  88. elmt.send_keys(kw)
  89. elmt.send_keys(Keys.ENTER)
  90. time.sleep(6)
  91. elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']/a")
  92. numresults=len(elmts)
  93. # time.sleep(9999)
  94. print('搜尋結果數量',numresults)
  95. if numresults==0:
  96. print(driver.current_url)
  97. print(driver.title)
  98. sys.exit()
  99. idx=1
  100. found=False
  101. test_lst=[]
  102. for elmt in elmts:
  103. href=elmt.get_attribute('href')
  104. txt=elmt.text
  105. if len(txt)>10:
  106. if domain is not None:
  107. for d in domain:
  108. if d in href:
  109. print('found....')
  110. print('clicked....')
  111. print(href)
  112. print(txt)
  113. print("ranking", idx)
  114. found=True
  115. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  116. # elmt.click()
  117. webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  118. table.insert({'ranking':idx,'kw':kw,'results':numresults,'url':href,'title':txt})
  119. time.sleep(6)
  120. return
  121. else:
  122. if exclude not in href:
  123. test_lst.append(elmt)
  124. idx+=1
  125. if exclude is not None:
  126. print('exclude')
  127. elmt=random.choice(test_lst)
  128. print(elmt)
  129. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  130. webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  131. time.sleep(5)
  132. if not found:
  133. table.insert({'ranking':-1,'kw':kw,'results':numresults,'url':'','title':'未收錄'})
  134. except:
  135. traceback.print_exc()
  136. print('exception')
  137. traceback.print_exc()
  138. driver.quit()
  139. time.sleep(5)
  140. r=random.randint(0,2)
  141. if r==0 or r==1:
  142. prefix="幸福空間 "
  143. postfix=' site:hhh.com.tw'
  144. domain=['hhh.com.tw']
  145. positive=['','設計','設計師','室內 設計','裝潢','室內 裝修','設計 公司','裝潢','北歐風']
  146. if r==2:
  147. prefix=""
  148. postfix=' site:hhh.com.tw'
  149. domain=['hhh.com.tw']
  150. positive=['艾立思','','艾立思軟裝','艾立思集團','御見設計','艾立思 家具訂製','艾立思 精品家具','艾立思 軟裝飾品','ELIZ','艾立思 郭柏君','艾立思 家配師','艾立思 郭柏君']
  151. portnum=random.randint(4444,4555)
  152. print(portnum)
  153. os.system('docker container stop p4444')
  154. time.sleep(1)
  155. os.system('docker container rm p4444')
  156. time.sleep(1)
  157. os.system('docker run -d -p '+str(portnum)+':4444 --name p4444 --dns 168.95.1.1 selenium/standalone-chrome:103.0')
  158. time.sleep(18)
  159. kw=random.choice(positive)
  160. run_once({'domain':domain,'kw':prefix+" "+kw})