123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128 |
- import random
- import sys
- import dataset
- from selenium import webdriver
- import traceback
- import datetime
- import codecs
- import time
- import urllib
- import argparse
- import schedule
- import logging
- import sys
- from logging.handlers import SysLogHandler
- import socket
- import pandas as pd
- import socket
- import os
# Remote syslog endpoint (host, UDP port) that receives this bot's records.
_LOG_SERVER = ('hhh.ptt.cx', 514)
logger = logging.getLogger('clickbot_100')
# Without an explicit level the logger inherits WARNING from the root logger,
# which silently discards every logger.debug(...) call made by this script.
logger.setLevel(logging.DEBUG)
handler1 = SysLogHandler(address=_LOG_SERVER, socktype=socket.SOCK_DGRAM)
logger.addHandler(handler1)
# Host name and process id identify which bot instance produced the record.
hname = socket.gethostname()
pid = str(os.getpid())
# Start-up marker; kept at CRITICAL severity (``fatal`` is an alias of it).
logger.critical('[clickbot_100][%s][%s]begin', hname, pid)
def restart_browser():
    """Restart the ``headless-shell`` container and build a Chrome driver.

    Returns:
        The ``webdriver.Chrome`` instance created with the options below.
    """
    os.system('docker container restart headless-shell')
    # Fixed pause after the restart before a driver is created.
    time.sleep(9)

    options = webdriver.ChromeOptions()
    for flag in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "start-maximized",
        "user-data-dir=/tmp",
    ):
        options.add_argument(flag)
    # NOTE(review): presumably the DevTools endpoint exposed by the restarted
    # container -- confirm against the container's port mapping.
    options.debugger_address = "127.0.0.1:9222"

    # Windows location of the chromedriver binary; a Linux deployment would
    # need a different path here.
    driver_path = r'C:\portable\webdriver\chrome98\chromedriver.exe'
    return webdriver.Chrome(chrome_options=options, executable_path=driver_path)
def process_one():
    """Search Google for one pending keyword, record the results, click the client link.

    Steps:
      1. Query ``selected_kw`` for the client's terms that have no
         ``taroboba-yuan.com`` row in ``save_result`` dated today.
      2. Pick one term at random, restart the browser and open a Google
         search for it (100 results, ``zh-TW``).
      3. Save screenshots, insert each result link into ``save_result`` and
         collect it for an Excel export.
      4. Click the result that points at ``taroboba-yuan.com``, if any.

    Returns:
        None. Returns early, without starting a browser, when no term is
        pending.

    Raises:
        SystemExit: when the result page yields no recordable links. This
            terminates the whole process, including a running schedule loop.
    """
    # Imported explicitly: a bare ``import urllib`` does not guarantee that
    # the ``urllib.parse`` submodule has been loaded.
    from urllib.parse import quote

    # NOTE(review): database credentials are embedded in this URL; consider
    # moving them to configuration.
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
    table = db['save_result']
    cursor = db.query('select term from selected_kw where client="清原" and term not in (SELECT distinct(keyword) FROM seo.save_result where url like "%taroboba-yuan.com%" and datediff(now(),dt)=0)')
    terms = [row['term'] for row in cursor]
    if not terms:
        # random.choice() raises IndexError on an empty list; nothing to do.
        print('no pending term')
        return

    term = random.choice(terms)
    print(term)
    logger.debug('[clickbot_100][%s]', term)

    driver = restart_browser()
    try:
        googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(quote(term), 100, 'zh-TW')
        print(googleurl)
        driver.get(googleurl)
        # Fixed pause before the page is captured and parsed.
        time.sleep(6)
        driver.save_screenshot('c:/tmp/test.png')
        fname = term.replace(' ', '_')
        driver.save_screenshot('c:/tmp/seo/' + fname + '.png')

        elmts = driver.find_elements_by_xpath("//div[@class='yuRUbf']/a")
        clickelmt = None
        cnt = 1  # rank; advances only for results that were recorded
        datadict = {'搜尋詞': [], '結果標題': [], '結果網址': [], '結果名次': []}

        # NOTE(review): the original indentation was lost. This nesting
        # assumes every result is recorded and only the client's link is
        # remembered for clicking -- confirm against the deployed script.
        for elmt in elmts:
            try:
                href = elmt.get_attribute('href')
                if 'taroboba-yuan.com' in href:
                    clickelmt = elmt
                    logger.debug('[clickbot_100][%s][%s]', term, cnt)
                # Read the title once so a failure cannot leave the four
                # columns with different lengths.
                title = elmt.text
                print(href)
                print(title)
                datadict['搜尋詞'].append(term)
                datadict['結果標題'].append(title)
                datadict['結果網址'].append(href)
                datadict['結果名次'].append(str(cnt))
                table.insert({'title': title, 'url': href, 'keyword': term, 'dt': datetime.datetime.now(), 'num': cnt})
                cnt += 1
            except Exception:
                # Best effort per result: report it and continue with the next.
                print('href2 exception')
                traceback.print_exc()

        if clickelmt is not None:
            # Hover first, then hover-and-click the client's link.
            webdriver.ActionChains(driver).move_to_element(clickelmt).perform()
            webdriver.ActionChains(driver).move_to_element(clickelmt).click().perform()

        if not datadict['結果標題']:
            print('None')
            sys.exit()

        df = pd.DataFrame(datadict)
        df.to_excel('c:/tmp/seo/' + fname + ".xls")
    finally:
        # Close the driver on every exit path, including errors and sys.exit().
        driver.quit()
# Parse the command line before doing any work so that a bad invocation (or
# --help) fails without first starting a browser session.
parser = argparse.ArgumentParser()
# ``--loop`` still accepts a value (any non-empty value enables the loop) and
# now also works as a bare flag.
parser.add_argument('--loop', nargs='?', const=True)
args = parser.parse_args()

# One run always happens immediately.
process_one()

if args.loop:
    # Repeat every 6 minutes until the process is stopped.
    schedule.every(6).minutes.do(process_one)
    while True:
        schedule.run_pending()
        time.sleep(1)
|