clickjob.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
  1. import time
  2. from datetime import datetime
  3. import json
  4. from selenium import webdriver
  5. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  6. import time
  7. import os
  8. import urllib.parse
  9. from selenium.webdriver.support.ui import WebDriverWait
  10. from selenium.webdriver.common.by import By
  11. from selenium.webdriver.support import expected_conditions as EC
  12. import codecs
  13. import random
  14. import requests
  15. import dataset
  16. import traceback
  17. import sys
  18. target_domain=['bennis.com.tw']
  19. brands={'bennis.com.tw':'班尼斯'}
  20. driver=None
  21. headers = {
  22. "Authorization": "Bearer " + "t35vhZtWNgvDNWHc3DJh0OKll3mcB9GvC8K2EAkBug2",
  23. "Content-Type": "application/x-www-form-urlencoded"
  24. }
  25. def send_msg(kw):
  26. params = {"message": "處理關鍵字: "+kw}
  27. r = requests.post("https://notify-api.line.me/api/notify",headers=headers, params=params)
  28. def empty_query(q):
  29. global driver
  30. googleurl='https://www.google.com/search?q='+urllib.parse.quote(q)
  31. driver.get(googleurl)
  32. time.sleep(3)
  33. def process_query(qs):
  34. q=qs[0]
  35. domain=qs[1]
  36. global driver
  37. googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(urllib.parse.quote(q), 100,'zh-TW')
  38. print(googleurl)
  39. driver.get(googleurl)
  40. #time.sleep(6)
  41. # elmts=driver.find_elements_by_xpath("//div[@class='yuRUbf']/a")
  42. # ABOVE METHOD IS DEPRECATED STARTING SELENIUM 4.3.0, USE THIS
  43. elmts=driver.find_elements("xpath","//div[@class='yuRUbf']/a")
  44. idx=1
  45. ranking=-1
  46. domain_in_link = 0
  47. print(len(elmts))
  48. # driver.save_screenshot('c:/tmp/test.png')
  49. for elmt in elmts:
  50. href=elmt.get_attribute('href')
  51. txt=elmt.text
  52. if len(txt)>10:
  53. if domain in href:
  54. domain_in_link += 1
  55. print('clicked....')
  56. print(href)
  57. print(txt)
  58. webdriver.ActionChains(driver).move_to_element(elmt).perform()
  59. webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  60. break
  61. if domain in target_domain:
  62. print("Target domain found")
  63. time_stamp = datetime.fromtimestamp(time.time())
  64. time_stamp = time_stamp.strftime("%Y-%m-%d %H:%M:%S")
  65. db['query_results'].insert({"time_stamp": time_stamp, "brand": brands[domain], "domain": domain, "query": q, "googleurl": googleurl, "element_count": len(elmts), "domain_in_link_count": domain_in_link})
  66. def run_once(q):
  67. global driver
  68. result=[]
  69. options = webdriver.ChromeOptions()
  70. options.add_argument('--headless')
  71. # options.add_argument("--user-agent=" +user_agent)
  72. options.add_argument("--incognito")
  73. options.add_argument('--no-sandbox')
  74. options.add_argument('--disable-dev-shm-usage')
  75. driver = webdriver.Chrome(
  76. options=options)
  77. driver.delete_all_cookies()
  78. driver.set_window_size(1400,1000)
  79. print(q)
  80. process_query(q)
  81. time.sleep(3)
  82. driver.quit()
  83. #lst=[{'kw':'幸福空間','domain':'hhh.com.tw','page':0}]
  84. lst=[]
  85. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
  86. cursor=db.query('select term,domain from selected_kw order by rand()')
  87. for c in cursor:
  88. lst.append(c)
  89. #for c in lst:
  90. while True:
  91. try:
  92. c=random.choice(lst)
  93. run_once( (c['term'],c['domain']) )
  94. except:
  95. traceback.print_exc()
  96. sleepint=random.randint(20,40)
  97. time.sleep(sleepint)