click_and_save.py 3.0 KB

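"""SEO click bot (clickbot_100).

Inferred from the code below: pick a random keyword/domain job from the
seo_clickjobs table, load the first 100 zh-TW Google results for the keyword,
record every result's title, URL and rank to MySQL, then click the result
whose URL matches the target domain.
"""
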
import random
import sys
import traceback
import datetime
import time
import urllib.parse
import argparse
import logging
import socket
from logging.handlers import SysLogHandler

import dataset
import pandas as pd
import schedule
from selenium import webdriver
from selenium.webdriver.common.by import By

# Ship run logs to a remote syslog collector over UDP.
_LOG_SERVER = ('hhh.ptt.cx', 514)
logger = logging.getLogger('clickbot_100')
handler1 = SysLogHandler(address=_LOG_SERVER, socktype=socket.SOCK_DGRAM)
logger.addHandler(handler1)
logger.setLevel(logging.DEBUG)  # DEBUG records are dropped without an explicit level
logger.debug('[click_and_save][DB]begin')


def restart_browser():
    """Start a fresh headless Chrome sized tall enough to render 100 results."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    driver.set_window_size(950, 6000)
    return driver


def process_one():
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
    table = db['save_result_listclick']
    # Pick one random (keyword, page, domain) job from the 202204 batch.
    cursor = db.query('select kw,page,domain from seo_clickjobs where category="202204" order by rand()')
    lst = list(cursor)
    entry = random.choice(lst)
    term = entry['kw']
    print(term)
    domain = entry['domain']
    logger.debug('[clickbot_100][' + term + ']')

    driver = restart_browser()
    escaped_search_term = urllib.parse.quote(term)
    googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(escaped_search_term, 100, 'zh-TW')
    print(googleurl)
    driver.get(googleurl)
    time.sleep(6)  # give the results page time to finish loading

    fname = term.replace(' ', '_')
    df = pd.DataFrame()
    # Organic result links sit under div.yuRUbf containers (Selenium 4 locator API).
    elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']/a")
    clickelmt = None
    cnt = 1
    # Chinese column names: 搜尋詞 = search term, 結果標題 = result title,
    # 結果網址 = result URL, 結果名次 = result rank.
    datadict = {'搜尋詞': [], '結果標題': [], '結果網址': [], '結果名次': []}
    for elmt in elmts:
        try:
            href = elmt.get_attribute('href')
            if domain in href:
                # Remember the result that belongs to the target domain.
                clickelmt = elmt
                logger.debug('[clickbot_100][' + term + '][' + str(cnt) + ']')
            print(href)
            print(elmt.text)
            datadict['搜尋詞'].append(term)
            datadict['結果標題'].append(elmt.text)
            datadict['結果網址'].append(href)
            datadict['結果名次'].append(str(cnt))
            table.insert({'title': elmt.text, 'url': href, 'keyword': term,
                          'dt': datetime.datetime.now(), 'num': cnt})
            cnt += 1
        except Exception:
            print('href2 exception')
            traceback.print_exc()
    if clickelmt:
        # Hover over the matching result, then click it to register the visit.
        webdriver.ActionChains(driver).move_to_element(clickelmt).perform()
        webdriver.ActionChains(driver).move_to_element(clickelmt).click().perform()
    if not datadict['結果標題']:
        # Nothing was scraped (e.g. a block/CAPTCHA page): stop the whole run.
        print('None')
        driver.quit()
        sys.exit()
    df['搜尋詞'] = datadict['搜尋詞']
    df['結果標題'] = datadict['結果標題']
    df['結果網址'] = datadict['結果網址']
    df['結果名次'] = datadict['結果名次']
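    # NOTE: fname and df are built but never written out in this version of
    # the script; an export step (e.g. df.to_csv(fname + '.csv')) presumably
    # lived elsewhere or was dropped.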
    driver.quit()


process_one()
parser = argparse.ArgumentParser()
parser.add_argument('--loop')
args = parser.parse_args()
if args.loop:
    # schedule.every(6).minutes.do(process_one)
    schedule.every(0.4).minutes.do(process_one)  # 0.4 minutes = every 24 seconds
    while True:
        schedule.run_pending()
        time.sleep(1)
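
# Usage (assumed): `python click_and_save.py` runs one job and exits;
# `python click_and_save.py --loop 1` keeps re-running process_one() roughly
# every 24 seconds via the schedule loop above.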