123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291 |
- #import redis
- import time
- import traceback
- #import json
- from selenium import webdriver
- from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
- import time
- #import urllib
- import os
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support import expected_conditions as EC
- import dataset
- from selenium.webdriver.common.keys import Keys
- import json
- import random
- import time
- #import redis
- import sys
- import codecs
- import random
- import os
- import time
- import requests
# Handle on the active Selenium session; re_get_webdriver() replaces it.
driver = None
# Name given to the throw-away Selenium container.
dockername = 'p4444'
# True: talk to Chrome inside the container; False: start a local Chrome.
is_docker = True
# NOTE(review): the connection string embeds credentials in source --
# consider loading it from the environment or a secrets store.
db = dataset.connect('postgresql://postgres:eyJhbGciOiJI@172.105.241.163:5432/postgres')

if is_docker:
    # Host port that gets mapped onto the container's port 4444.
    portnum = random.randint(4444, 4555)
    print(portnum)
    # Stop and remove any container with the same name before starting anew.
    for _docker_action in ('stop', 'rm'):
        os.system('docker container ' + _docker_action + ' ' + dockername)
        time.sleep(0.5)
    os.system(
        f'docker run -d -p {portnum}:4444 --shm-size=2g --name {dockername}'
        ' --dns 168.95.1.1 selenium/standalone-chrome:103.0'
    )
    # Pause before the first connection attempt so the container can start.
    time.sleep(7)
def re_get_webdriver():
    """Discard the current global driver and open a fresh Chrome session.

    Any driver held in the module-level ``driver`` is quit first. When
    ``is_docker`` is true the new session is opened through the Selenium
    endpoint at ``http://127.0.0.1:<portnum>/wd/hub``; otherwise a local
    Chrome is started. Chrome runs with ``--no-sandbox``, ``--headless``
    and ``--incognito``.

    Returns:
        The new driver (also stored in the module-level ``driver``), or
        ``None`` when no session could be created.
    """
    global driver

    if driver is not None:
        print('closing....')
        try:
            driver.quit()
        except Exception:
            # quit() raises when the old session is already dead; that must
            # not prevent a replacement session from being created.
            traceback.print_exc()
        finally:
            driver = None
        print('quit....')

    try:
        options = webdriver.ChromeOptions()
        options.add_argument("--no-sandbox")
        options.add_argument("--headless")
        options.add_argument("--incognito")

        if is_docker:
            try:
                driver = webdriver.Remote(
                    command_executor='http://127.0.0.1:' + str(portnum) + '/wd/hub',
                    options=options)
            except Exception:
                traceback.print_exc()
                # NOTE(review): very long stall kept from the original;
                # presumably a deliberate back-off when the container is
                # unreachable -- confirm. Ctrl-C now interrupts it.
                time.sleep(9999)
                return None
            return driver

        try:
            driver = webdriver.Chrome(options=options)
        except Exception:
            traceback.print_exc()
            return None
        return driver
    except Exception:
        # Covers failures while building the options object.
        traceback.print_exc()
        driver = None
        return None
def run_once(jsobj):
    """Search Google for ``jsobj['kw']`` and click one of the results.

    Two modes, selected by ``jsobj``:

    * ``jsobj['domain']`` is set (an iterable of substrings): the last
      result whose link contains any of them is clicked. If the search
      returned at least 40 results, a row built from ``jsobj['cust']``,
      ``jsobj['plan']``, the domain list and the keyword is inserted into
      the ``seo_jobs`` table.
    * ``jsobj['domain']`` is missing or ``None``: ``jsobj['exclude']`` is
      required, and a random result whose link does not contain it is
      clicked.

    Only results whose link text is longer than 10 characters are
    considered for clicking. The process exits when the search yields no
    result links. Other errors are logged and the browser session is quit.

    Args:
        jsobj: dict with ``kw`` plus the mode-specific keys listed above.
    """
    # Titles containing any of these substrings are counted as negative.
    negative_terms = ('坑殺', '侵占', '判決書', '強佔', '掏空', '送達公告',
                      '違反勞動', '不倫', '開房', '摩鐵')
    search_url = 'https://www.google.com?num=100'

    table = db['seo_jobs']
    print(jsobj)
    kw = jsobj['kw']

    # Keep asking until a browser session is available.
    while True:
        driver = re_get_webdriver()
        print('re_get_webdriver')
        if driver is not None:
            break
        time.sleep(3)

    try:
        domain = jsobj.get('domain')
        exclude = jsobj['exclude'] if domain is None else None

        driver.get(search_url)
        time.sleep(1)
        # Probe the session; rebuild it until current_url can be read.
        while True:
            try:
                print(driver.current_url)
                break
            except Exception:
                traceback.print_exc()
                driver = re_get_webdriver()
                time.sleep(3)
                # re_get_webdriver() may return None; the next probe then
                # fails and triggers another rebuild instead of crashing.
                if driver is not None:
                    driver.get(search_url)
        time.sleep(3)

        box = driver.find_element(By.XPATH, "//textarea[@type='search']")
        time.sleep(1)
        box.send_keys(kw)
        box.send_keys(Keys.ENTER)
        time.sleep(3)

        elmts = driver.find_elements(By.XPATH, "//a[@jsname='UWckNb']")
        numresults = len(elmts)

        print('搜尋結果數量', numresults)
        if numresults == 0:
            print(driver.current_url)
            print(driver.title)
            _quit_driver(driver)
            # SystemExit is not an Exception subclass, so the handler
            # below no longer swallows it.
            sys.exit()

        found = False
        test_lst = []
        clickelmt = None
        neg_count = 0
        neg_total = 0
        for idx, elmt in enumerate(elmts, start=1):
            # get_attribute() can return None; normalise so the substring
            # tests below cannot raise TypeError.
            href = elmt.get_attribute('href') or ''
            txt = elmt.text
            if any(term in txt for term in negative_terms):
                neg_count += 1
                neg_total += idx
            if len(txt) <= 10:
                continue
            if domain is not None:
                if any(d in href for d in domain):
                    print('found....')
                    print('clicked....')
                    print(href)
                    print(txt)
                    print("ranking", idx)
                    found = True
                    # Later matches overwrite earlier ones, so the last
                    # matching result is the one that gets clicked.
                    clickelmt = elmt
            elif exclude not in href:
                test_lst.append(elmt)

        if exclude is not None:
            print('exclude')
            if test_lst:
                pick = random.choice(test_lst)
                print(pick)
                webdriver.ActionChains(driver).move_to_element(pick).perform()
                webdriver.ActionChains(driver).move_to_element(pick).click().perform()
                time.sleep(5)
            else:
                print('no clickable result outside the excluded domain')

        # Average rank of the negative results, '0' when there are none.
        negstr = '0' if neg_count == 0 else str(neg_total / neg_count)
        print(' negative: ' + negstr)
        if neg_total > 0:
            print('negative.....')

        if not found:
            print('not found')
        else:
            webdriver.ActionChains(driver).move_to_element(clickelmt).perform()
            webdriver.ActionChains(driver).move_to_element(clickelmt).click().perform()
            print('clicked...')
            entry = {
                'cust': jsobj['cust'],
                'plan': jsobj['plan'],
                'prefix': '',
                'postfix': '',
                'domain': str(domain),
                'kw': kw,
                'positive': str(['']),
            }
            if numresults >= 40:
                print(entry)
                table.insert(entry)
            time.sleep(6)
            print('sleep 6')
    except Exception:
        traceback.print_exc()
        print('exception')
        _quit_driver(driver)
        time.sleep(5)


def _quit_driver(drv):
    """Quit *drv*, logging instead of raising if the session is already gone."""
    if drv is None:
        return
    try:
        drv.quit()
    except Exception:
        traceback.print_exc()
# Not referenced anywhere in the visible code; kept so the module-level
# names stay defined.
r = 987
# JNOTE: keyword click
related = ''

# Pick one random row from the test-job table.
cursor = db.query('SELECT id,cust,plan,prefix,domain,kw,positive FROM public.seo_random_test_jobs order by random() limit 1')
c = next(iter(cursor), None)
if c is None:
    # Without this guard an empty table surfaced as a NameError on `c`.
    sys.exit('seo_random_test_jobs returned no row; nothing to run')

id = c['id']
cust = c['cust']
kw = c['kw']
plan = c['plan']
prefix = c['prefix']
# NOTE(review): eval() executes whatever text is stored in these columns.
# If the rows only ever hold plain list literals, ast.literal_eval would be
# a safer drop-in -- confirm against the writers of this table.
domain = eval(c['domain'])
positive = eval(c['positive'])

# The row is deleted before the search runs.
print('delete from seo_random_test_jobs where id='+str(c['id'])+'')
db.query('delete from seo_random_test_jobs where id='+str(c['id'])+'')

# Search phrase: prefix, keyword, then one random entry from `positive`.
# A NULL prefix or an empty `positive` list contributes an empty string
# instead of raising.
suffix = random.choice(positive) if positive else ''
newkw = (prefix or '') + " " + kw + ' ' + suffix
print(newkw)
run_once({'domain': domain, 'kw': newkw, 'id': id, 'cust': cust, 'plan': plan, 'positive': positive})
|