ranking_day2.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139
  1. from random import randint
  2. import sys
  3. import dataset
  4. from selenium import webdriver
  5. import traceback
  6. import datetime
  7. import codecs
  8. import time
  9. import urllib
  10. import argparse
  11. import logging
  12. import sys
  13. from logging.handlers import SysLogHandler
  14. import socket
  15. import pandas as pd
  16. #import pymysql
  17. #pymysql.install_as_MySQLdb()
  18. import random
  19. from selenium.webdriver.common.by import By
  20. from selenium.webdriver.chrome.service import Service
  21. from selenium.webdriver.support.ui import WebDriverWait
  22. import os
  23. import fire
  24. path = 'C:\portable\chromedriver'
  25. path_z = '/Users/zooeytsai/Downloads/chromedriver 2'
  26. driver = None
  27. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
  28. lst = []
  29. table = db['google_rank']
  30. def rua():
  31. pool = [
  32. "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0",
  33. "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0",
  34. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
  35. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
  36. "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
  37. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
  38. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36 OPR/68.0.3618.125",
  39. ]
  40. return random.choice(pool)
  41. def process_one(item):
  42. global driver
  43. term = item[0]
  44. domain = item[1]
  45. print(term, domain)
  46. escaped_search_term = urllib.parse.quote(term)
  47. googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(escaped_search_term, 100, 'zh-TW')
  48. print(googleurl)
  49. driver.get(googleurl)
  50. time.sleep(6)
  51. # fname=term.replace(' ','_')
  52. # driver.save_screenshot('c:/tmp/seo/'+fname+'.png')
  53. # df=pd.DataFrame()
  54. elmts = driver.find_elements(By.XPATH,"//div[@class='yuRUbf']/a")
  55. cnt = 1
  56. clickelmt=None
  57. datadict = {'搜尋詞': [], '結果標題': [], '結果網址': [], '結果名次': []}
  58. print('搜尋結果',len(elmts))
  59. for elmt in elmts:
  60. try:
  61. href = elmt.get_attribute('href')
  62. datadict['搜尋詞'].append(term)
  63. datadict['結果標題'].append(elmt.text)
  64. datadict['結果網址'].append(href)
  65. datadict['結果名次'].append(str(cnt))
  66. if domain in href:
  67. clickelmt = elmt
  68. print(href)
  69. print(elmt.text)
  70. table.insert(
  71. {'kw': term, 'domain': domain, 'ranking': cnt, 'title': elmt.text, 'url': href,'dt': datetime.datetime.now()})
  72. # webdriver.ActionChains(driver).move_to_element(elmt).perform()
  73. # webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
  74. cnt += 1
  75. except:
  76. print('href2 exception')
  77. traceback.print_exc()
  78. if clickelmt:
  79. webdriver.ActionChains(driver).move_to_element(clickelmt).perform()
  80. webdriver.ActionChains(driver).move_to_element(clickelmt).click().perform()
  81. time.sleep(5)
  82. print('點擊完成')
  83. if len(datadict['結果標題']) <= 0:
  84. print('None')
  85. driver.quit()
  86. sys.exit()
  87. # df['搜尋詞']=datadict['搜尋詞']
  88. # df['結果標題']=datadict['結果標題']
  89. # df['結果網址']=datadict['結果網址']
  90. # df['結果名次']=datadict['結果名次']
  91. #
  92. # df.to_excel('/Users/zooeytsai/'+fname+".xls")
  93. driver.quit()
  94. print('結束')
  95. def run_once(q):
  96. global driver
  97. result = []
  98. s = Service('/root/driver/chromedriver')
  99. user_agent = rua()
  100. options = webdriver.ChromeOptions()
  101. options.add_argument('--headless')
  102. options.add_argument('--remote-debugging-port=9222')
  103. options.add_experimental_option("debuggerAddress", f"127.0.0.1:{q[2]}")
  104. options.add_argument("--user-agent=" + user_agent)
  105. options.add_argument("--incognito")
  106. driver = webdriver.Chrome(options=options, service=s)
  107. driver.delete_all_cookies()
  108. driver.set_window_size(1400, 1000)
  109. process_one(q)
  110. time.sleep(3)
  111. driver.quit()
  112. class JParams(object):
  113. def get(self, kw, domain, port):
  114. print(kw)
  115. print(domain)
  116. run_once((kw, domain, port))
  117. if __name__ == '__main__':
  118. fire.Fire(JParams)