| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596 | 
							- import traceback
 
- from selenium import webdriver
 
- from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 
- import time
 
- import os
 
- import datetime
 
- import urllib.parse
 
- from selenium.webdriver.support.ui import WebDriverWait
 
- from selenium.webdriver.common.by import By
 
- from selenium.webdriver.support import expected_conditions as EC
 
- import codecs
 
- import random
 
- from bs4 import BeautifulSoup
 
- import requests
 
- import time
 
- # import rpyc
 
- import sys
 
- import docker
 
- # import googlesearch
 
- import codecs
 
- import sys
 
- import time
 
- import dataset
 
- import os
 
- import html2text
 
def process_one(driver):
    """Collect title/URL pairs from the result links on the current page.

    Looks up every ``<a>`` directly under ``<div class="yuRUbf">`` in the page
    currently loaded in *driver* and records its ``href`` together with the
    first line of its visible text.

    Args:
        driver: A Selenium WebDriver positioned on a search results page.

    Returns:
        A list of ``{'title': ..., 'url': ...}`` dicts, one per link that
        could be read. Links that raise while being read are skipped after
        the traceback is printed.
    """
    # NOTE(review): find_elements_by_xpath is the legacy Selenium locator API;
    # confirm the pinned Selenium version still provides it.
    lst = []
    elmts = driver.find_elements_by_xpath("//div[@class='yuRUbf']/a")
    for elmt in elmts:
        try:
            href = elmt.get_attribute('href')
            # The anchor text can span several lines; only the first is kept.
            title = elmt.text.split('\n')[0]
            print(title)
            lst.append({'title': title, 'url': href})
        except Exception:
            # Best effort: one unreadable link must not abort the whole page.
            # Exception (not a bare except) lets Ctrl-C / SystemExit through.
            print('href2 exception')
            traceback.print_exc()
    return lst
 
def process_query(driver, qs, number_results=10, language_code='zh-TW', enable_next=True):
    """Run a Google search for *qs* and collect the result links.

    Loads the Google results URL for the query in *driver*, scrapes the page
    with ``process_one`` and, when *enable_next* is true, keeps clicking the
    "next page" link (``<a id="pnnext">``) and scraping until that link can
    no longer be found or clicked.

    Args:
        driver: A Selenium WebDriver used to load and navigate the pages.
        qs: The raw search query; it is URL-quoted before use.
        number_results: Used for the ``num`` URL parameter (sent as
            ``number_results + 1``). The returned list is not truncated to
            this size.
        language_code: Value for the ``hl`` URL parameter.
        enable_next: When false, only the first results page is scraped.

    Returns:
        A list of ``{'title': ..., 'url': ...}`` dicts accumulated over all
        scraped pages.
    """
    escaped_search_term = urllib.parse.quote(qs)
    # NOTE(review): the extra +1 on num is kept from the original; the reason
    # for it is not visible in this file -- confirm.
    googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(
        escaped_search_term, number_results + 1, language_code)
    print(googleurl)
    driver.get(googleurl)
    time.sleep(3)  # fixed pause, presumably to let the results page render

    totallst = []
    while True:
        totallst += process_one(driver)
        if not enable_next:
            break
        try:
            time.sleep(3)
            # NOTE(review): find_element_by_xpath is the legacy Selenium
            # locator API; confirm the pinned Selenium version provides it.
            elmt = driver.find_element_by_xpath("//a[@id='pnnext']")
            # Hover over the link first, then hover-and-click, as two
            # separate action chains (same sequence as before).
            webdriver.ActionChains(driver).move_to_element(elmt).perform()
            webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
        except Exception:
            # A missing/unclickable "next" link ends pagination. Exception
            # (not a bare except) keeps Ctrl-C / SystemExit working.
            traceback.print_exc()
            print('pnnext exception')
            break
        time.sleep(1.5)
    return totallst
 
# Module-level state. `result` starts as an empty list and `driver` is a
# placeholder that is rebound once restart_browser() is called further down.
result = list()
driver = None
 
def restart_browser():
    """Start a new local Chrome session and return its WebDriver.

    Despite the name, nothing is shut down here: the function only creates a
    fresh driver from default ChromeOptions and resizes its window to
    1400x1000.

    Returns:
        The newly created Chrome WebDriver.
    """
    # Earlier variants (left disabled in the original) restarted the docker
    # container 'p4444', routed traffic through a proxy, pointed at a local
    # chromedriver binary, or connected to a remote hub at
    # http://127.0.0.1:4444/wd/hub instead of launching Chrome locally.
    chrome_options = webdriver.ChromeOptions()
    capabilities = chrome_options.to_capabilities()
    # NOTE(review): `desired_capabilities` is a legacy constructor argument;
    # confirm the pinned Selenium version still accepts it.
    browser = webdriver.Chrome(desired_capabilities=capabilities)
    browser.set_window_size(1400, 1000)
    return browser
 
# Script entry: run one site-restricted Google query and store every hit in
# the `kw_url_search_result` table.
#
# NOTE(security): the connection URL below embeds the database user and
# password in source control; consider loading it from configuration or the
# environment instead.
db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
table = db['kw_url_search_result']

driver = restart_browser()
try:
    lst = process_query(driver, '班尼斯 site:mobile01.com', number_results=50,
                        language_code='zh-TW', enable_next=False)
finally:
    # Always close the browser, even if the search raised, so the Chrome and
    # chromedriver processes are not left running after the script ends.
    driver.quit()

# Each entry is a {'title': ..., 'url': ...} dict and becomes one row.
for row in lst:
    table.insert(row)
print(lst)
 
- #print(html2text.html2text("<p><strong>Zed's</strong> dead baby, <em>Zed's</em> dead.</p>"))
 
 
  |