clickbot_100.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. import random
  2. import sys
  3. import dataset
  4. from selenium import webdriver
  5. from selenium.webdriver.chrome.service import Service
  6. from selenium.webdriver.common.by import By
  7. import traceback
  8. import datetime
  9. import codecs
  10. import time
  11. import urllib
  12. import argparse
  13. import logging
  14. import sys
  15. from logging.handlers import SysLogHandler
  16. import socket
  17. import pandas as pd
  18. import socket
  19. import os
  20. import dataset
  21. import pymysql
# Make PyMySQL importable under the name MySQLdb so MySQLdb-based database
# drivers pick it up.
pymysql.install_as_MySQLdb()
# Module-level handle to the most recently started browser; rebound by
# restart_browser() through its `global driver` declaration.
driver = None
  24. def restart_browser():
  25. global driver
  26. user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
  27. s = Service('/Users/mac/Downloads/123/chromedriver')
  28. options = webdriver.ChromeOptions()
  29. options.add_argument('--headless')
  30. # options.add_argument('--remote-debugging-port=9222')
  31. # options.add_experimental_option("debuggerAddress", "192.168.192.45:9922")
  32. options.add_argument("--user-agent=" + user_agent)
  33. options.add_argument("--incognito")
  34. # options.add_argument('--proxy-server=socks5://172.104.93.163:41800')
  35. driver = webdriver.Chrome(
  36. options=options, service=s)
  37. str1 = driver.capabilities['browserVersion']
  38. print('版本', str1)
  39. driver.delete_all_cookies()
  40. driver.set_window_size(1400, 20000)
  41. return driver
  42. def process_one():
  43. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
  44. lst = []
  45. table=db['google_rank']
  46. # cursor = db.query('select term,domain from seo.selected_kw where client="新飛0721"')
  47. # cursor = db.query('select term,url from seo.sns_kw where client="理茶"')
  48. cursor = db.query('select term,domain from seo.selected_kw where client="歌林"')
  49. # cursor = db.query('select term,domain from seo.selected_kw where id between 3367 and 3375')
  50. # cursor = db.query('select term,domain from seo.select_kw where client="幸福空間"')
  51. # cursor=db.query('select term from selected_kw and term not in (SELECT distinct(keyword) FROM ig_tags.save_result where url like "%beastparadise.net%" and datediff(now(),dt)=0)')
  52. for c in cursor:
  53. lst.append([c['term'],c['domain']])
  54. # lst.append([c['term'],c['url']])
  55. # lst.append(c['term'])
  56. # domain = 'fleurancenature.tw'
  57. for i in lst:
  58. print(i)
  59. driver=restart_browser()
  60. escaped_search_term=urllib.parse.quote(i[0])
  61. googleurl = 'https://www.google.com/search?q={}&num={}&hl={}&gl=tw'.format(escaped_search_term, 100,'zh-TW')
  62. driver.get(googleurl)
  63. print(driver.current_url)
  64. # driver.save_screenshot('C:\/Users\/s1301\/Pictures\/Saved Pictures\/新飛\/0721\/'+i[0]+'.png')
  65. df=pd.DataFrame()
  66. elmts=driver.find_elements(By.XPATH,"//div[@class='yuRUbf']//a")
  67. print('結果數量',len(elmts))
  68. cnt=1
  69. datadict={'搜尋詞':[],'結果標題':[],'結果網址':[],'結果名次':[]}
  70. for elmt in elmts:
  71. try:
  72. href=elmt.get_attribute('href')
  73. if i[1] in href:
  74. table.insert({'title':elmt.text,'url':href,'keyword':i[0],'dt':datetime.datetime.now(),'ranking':cnt})
  75. print(href)
  76. print(elmt.text)
  77. # if 'hhh.com.tw' in href:
  78. # table.insert({'title': elmt.text, 'url': href, 'keyword': i, 'dt': datetime.datetime.now(),
  79. # 'ranking': cnt})
  80. # print(href)
  81. # print(elmt.text)
  82. datadict['搜尋詞'].append(i[0])
  83. datadict['結果標題'].append(elmt.text)
  84. datadict['結果網址'].append(href)
  85. datadict['結果名次'].append(str(cnt))
  86. cnt+=1
  87. except:
  88. print('href2 exception')
  89. traceback.print_exc()
  90. df['搜尋詞']=datadict['搜尋詞']
  91. df['結果標題']=datadict['結果標題']
  92. df['結果網址']=datadict['結果網址']
  93. df['結果名次']=datadict['結果名次']
  94. # df.to_excel('C:\/Users\/s1301\/Pictures\/Saved Pictures\/芙樂思\/0720\/'+i+'.png')
  95. driver.quit()
  96. print('等待')
  97. time.sleep(random.randint(70,90))
  98. db.close()
# Run one ranking pass as soon as this file is executed; there is no
# __main__ guard, so importing the module triggers it as well.
process_one()