123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311 |
- #import redis
- import time
- import traceback
- #import json
- from selenium import webdriver
- from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
- import time
- import urllib
- import os
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support import expected_conditions as EC
- import dataset
- from selenium.webdriver.common.keys import Keys
- import json
- import random
- import time
- import redis
- import sys
- import codecs
- import random
- import os
- import time
- import requests
- import datetime
# Third-party setup: let SQLAlchemy-style 'mysql://' URLs use PyMySQL as the
# MySQLdb driver, and build one User-Agent generator for the whole process.
import pymysql
from fake_useragent import UserAgent

pymysql.install_as_MySQLdb()
ua = UserAgent()

# Most recently created Selenium session; assigned by re_get_webdriver().
driver = None
def re_get_webdriver():
    """Open a new headless Chrome session on the local Selenium hub.

    The hub address is ``http://127.0.0.1:<portnum>/wd/hub``, where
    ``portnum`` is the module-level port chosen when the Selenium container
    was started.  Each session gets a random User-Agent from the module-level
    ``ua`` generator.

    The new session is stored in the module-level ``driver`` and returned.

    Returns:
        The newly created ``webdriver.Remote`` session.

    Raises:
        Any exception raised by ``webdriver.Remote`` (for example when the
        hub is not reachable).  It is deliberately not caught here: callers
        use the failure as the signal to restart the container.
    """
    global driver

    options = webdriver.ChromeOptions()
    options.add_argument("--user-agent=" + ua.random)
    options.add_argument("--no-sandbox")
    options.add_argument("--headless")
    options.add_argument("--incognito")

    # portnum is only read here, so no ``global`` declaration is needed.
    driver = webdriver.Remote(
        command_executor='http://127.0.0.1:' + str(portnum) + '/wd/hub',
        options=options,
    )
    return driver
def _restart_selenium_container():
    """Recreate the ``p9916`` Selenium container on a new random host port.

    Picks a port in 1399..1599, stores it in the module-level ``portnum``,
    then stops, removes and re-runs the container, pausing 5 seconds after
    each Docker command.
    """
    global portnum

    portnum = random.randint(1399, 1599)
    print(portnum)
    os.system('docker container stop p9916')
    time.sleep(5)
    os.system('docker container rm p9916')
    time.sleep(5)
    os.system('docker run -d -p '+str(portnum)+':4444 --name p9916 --shm-size=500M --dns 168.95.1.1 selenium/standalone-chrome:106.0')
    time.sleep(5)


def run_once():
    """Open every column detail page once, each in a fresh browser session.

    Reads all rows of the ``columnids`` table in random order, builds one
    tracked detail-page URL per ``cid``, and loads each URL in a new session
    obtained from ``re_get_webdriver()``.

    Module-level counters updated:
        count: incremented after each page that loaded without an exception;
            reset to 0 whenever the Selenium container is restarted.
        bok: incremented whenever the container had to be restarted.

    Raises:
        Exception: anything raised while querying the database, or by the
            second ``re_get_webdriver()`` attempt after a container restart.
    """
    global count
    global bok

    # NOTE(review): database credentials are hard-coded in the URL below;
    # consider moving them to configuration or the environment.
    db = dataset.connect('mysql://choozmo:pAssw0rd@172.105.194.225:3306/seo?charset=utf8mb4')
    try:
        ettoday_url_list = [
            'https://hhh.com.tw/HHH_NEW/columns_detail/'+str(row['cid'])+'.php?utm_source=choozmo&utm_medium=banner&utm_campaign=choozmo'
            for row in db.query('SELECT * FROM columnids order by rand()')
        ]
    finally:
        # run_once() is called in an endless loop, so release the connection
        # explicitly instead of leaving one behind per call.
        db.close()

    for url in ettoday_url_list:
        try:
            driver = re_get_webdriver()
        except Exception:
            # The hub did not give us a session: rebuild the container and
            # try once more.  A second failure propagates to the caller.
            print('driver broken')
            _restart_selenium_container()
            bok += 1
            count = 0
            driver = re_get_webdriver()
        time.sleep(3)

        try:
            driver.get(url)
            time.sleep(3)
            print("cick!", url)
            count += 1
            print("count_time:", count, ';broken_time:', bok)
            time.sleep(random.randint(3, 5))
            driver.quit()
        except Exception:
            # ``except Exception`` (not a bare ``except``) so Ctrl-C and
            # SystemExit still stop the script.
            try:
                driver.quit()
            except Exception:
                print('no have driver')
            print("wrong", url, ';broken_time:', bok)
            time.sleep(5)
# Script entry: start the Selenium container on a random host port, then
# call run_once() forever.
portnum = random.randint(1399, 1599)
print(portnum)
os.system('docker container stop p9916')
time.sleep(5)
os.system('docker container rm p9916')
time.sleep(5)
os.system('docker run -d -p '+str(portnum)+':4444 --name p9916 --shm-size=500M --dns 168.95.1.1 selenium/standalone-chrome:106.0')

# Module-level counters shared with run_once() through ``global``.
bok = 0      # number of failures / container restarts so far
count = 0    # pages loaded since the last container restart
time.sleep(5)

while True:
    try:
        run_once()
    except Exception:
        # Catch ``Exception`` rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit can still terminate the loop, and
        # print the traceback so the cause of the failure is visible.
        traceback.print_exc()
        bok += 1
        print('broken')
        time.sleep(5)
|