ranking_day.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148
  1. import sys
  2. import dataset
  3. from selenium import webdriver
  4. import traceback
  5. import datetime
  6. import codecs
  7. import time
  8. import urllib
  9. import argparse
  10. import logging
  11. import sys
  12. from logging.handlers import SysLogHandler
  13. import socket
  14. import pandas as pd
  15. import random
  16. from selenium.webdriver.chrome.service import Service
  17. import os
  18. from random import randint
  19. import pymysql
  # Register PyMySQL under the MySQLdb name before the database connection
  # below is created (presumably so the mysql:// URL is served by PyMySQL).
  pymysql.install_as_MySQLdb()

  # chromedriver locations on two developer machines; neither is referenced
  # by the code visible in this file. Raw string: '\p' and '\c' are not valid
  # escape sequences, so the non-raw form only worked by accident and emits a
  # warning on current Python versions. The resulting value is unchanged.
  path = r'C:\portable\chromedriver'
  path_z = '/Users/zooeytsai/Downloads/chromedriver 2'

  # Shared Selenium driver: assigned by run_once() and used by process_one().
  driver = None

  # NOTE(review): the database user and password are hard-coded in this URL;
  # consider loading them from the environment or a config file instead.
  db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')

  # [term, domain] pairs to process; filled from seo.selected_kw further down.
  lst = []

  # Table that receives one row per search result matching the target domain.
  table = db['google_rank']
  27. def rua():
  28. pool = [
  29. "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0",
  30. "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0",
  31. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
  32. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
  33. "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
  34. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
  35. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36 OPR/68.0.3618.125",
  36. ]
  37. return random.choice(pool)
  def process_one(item):
      """Search Google for one keyword and record where the domain ranks.

      Loads the Google results page for the term through the module-level
      ``driver``, walks the organic result links and inserts one row into
      ``table`` for every result whose URL contains the target domain.

      Args:
          item: Sequence whose first element is the search term and whose
              second element is the domain to look for in result URLs.

      Side effects:
          * When no result links are found, runs ``/root/reboot.py``.
          * When no result title could be collected, quits the driver and
            terminates via ``sys.exit()``.
          * Otherwise quits the driver and sleeps 90-120 seconds.
      """
      global driver
      # `import urllib` alone does not guarantee that the urllib.parse
      # submodule is loaded, so import the function that is actually needed.
      from urllib.parse import quote

      term = item[0]
      domain = item[1]
      print(term, domain)
      escaped_search_term = quote(term)
      googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(escaped_search_term, 100, 'zh-TW')
      print(googleurl)
      driver.get(googleurl)
      time.sleep(6)

      # find_elements_by_xpath() was removed in Selenium 4.3; the generic
      # find_elements() with the 'xpath' locator strategy works on both the
      # old and the new API.
      elmts = driver.find_elements('xpath', "//div[@class='yuRUbf']/a")

      # Collected rows; only the title column is consulted below, to detect
      # an empty result page.
      datadict = {'搜尋詞': [], '結果標題': [], '結果網址': [], '結果名次': []}

      if not elmts:
          print('chrome異常')
          os.chdir('/root')
          os.system('python3 reboot.py')

      # enumerate() keeps the rank equal to the link's 1-based position on
      # the page even when handling one of the earlier links raised.
      for cnt, elmt in enumerate(elmts, start=1):
          try:
              href = elmt.get_attribute('href')
              title = elmt.text
              datadict['搜尋詞'].append(term)
              datadict['結果標題'].append(title)
              datadict['結果網址'].append(href)
              datadict['結果名次'].append(str(cnt))
              if domain in href:
                  print(href)
                  print(title)
                  table.insert(
                      {'title': title, 'url': href, 'keyword': term, 'dt': datetime.datetime.now(), 'num': cnt})
          except Exception:
              # One broken element must not abort the whole page, but
              # Ctrl-C / SystemExit are no longer swallowed here.
              print('href2 exception')
              traceback.print_exc()

      if not datadict['結果標題']:
          print('None')
          driver.quit()
          sys.exit()

      driver.quit()
      print('中場休息')
      # Randomised pause between searches.
      time.sleep(randint(90, 120))
  def run_once(pport, item):
      """Create a Chrome driver, process one keyword, and always quit the driver.

      Args:
          pport: Port on 127.0.0.1 passed to Chrome's ``debuggerAddress``
              option.
          item: ``[term, domain]`` pair forwarded unchanged to process_one().

      The driver is stored in the module-level ``driver`` variable because
      process_one() reads it from there.
      """
      global driver
      s = Service('/root/driver/chromedriver')
      user_agent = rua()

      options = webdriver.ChromeOptions()
      options.add_argument('--headless')
      options.add_argument('--remote-debugging-port=9222')
      # NOTE(review): with debuggerAddress set, chromedriver presumably
      # attaches to an already running browser, in which case the launch
      # arguments on this options object may have no effect - confirm.
      options.add_experimental_option("debuggerAddress", f"127.0.0.1:{pport}")
      options.add_argument("--user-agent=" + user_agent)
      options.add_argument("--incognito")

      driver = webdriver.Chrome(options=options, service=s)
      try:
          driver.delete_all_cookies()
          driver.set_window_size(1400, 1000)
          process_one(item)
          time.sleep(3)
      finally:
          # Quit the driver even when process_one() raises or exits, so a
          # failed run does not leave a driver session behind.
          driver.quit()
  # Load every (term, domain) pair that should be ranked.
  cursor = db.query('select term,domain from seo.selected_kw')
  lst.extend([row['term'], row['domain']] for row in cursor)

  for i in lst:
      print('這裡', i)
      # Retry the same keyword until one full run finishes without an error.
      while True:
          try:
              # Restart the browser container so each run starts clean.
              os.system('docker container restart tiny9')
              time.sleep(1)
              run_once(9928, i)
              print('docker開啟完成')
              # If the newest stored row is not for this keyword, wait a
              # minute before moving on.
              cur = db.query('select * from seo.google_rank order by id desc limit 1')
              for c in cur:
                  kw = c['keyword']
                  if kw != i[0]:
                      print('稍等,上一筆待完成')
                      time.sleep(60)
              break
          except (Exception, SystemExit):
              # SystemExit is caught on purpose: process_one() calls
              # sys.exit() on an empty result page and that case is retried.
              # KeyboardInterrupt is deliberately NOT caught, so Ctrl-C can
              # stop the script instead of triggering another retry.
              traceback.print_exc()
              os.system('docker container restart tiny9')
              time.sleep(15)
      print('等待進行下一個關鍵字')
      time.sleep(5)