"""Scrape Google video-search results for a list of search terms.

For every term a fresh headless Chrome session is started, the result page is
screenshotted, and the result links (term, title, URL, rank) are written to a
CSV file next to the screenshot.
"""
import argparse
import codecs
import datetime
import logging
import os
import random
import socket
import sys
import time
import traceback
import urllib
import urllib.parse  # explicit: plain ``import urllib`` does not guarantee the submodule is loaded
from logging.handlers import SysLogHandler

import dataset
import pandas as pd
import pymysql
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

pymysql.install_as_MySQLdb()

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
CHROMEDRIVER_PATH = '/Users/mac/Downloads/127/chromedriver'
OUTPUT_DIR = '/Users/mac/Downloads/'
DATE_TAG = '0731'

SEARCH_TERMS = ['信義房屋', '信義 房屋', '信義房仲', '信義 房仲']
# Alternative term sets that were previously commented out, kept for reference:
# ["驊揚室內裝修設計", "驊揚室內裝修設計 評論"]
# ['真理大學', '真理大學停招', '真理大學倒閉', '真理大學評價', '真理大學倒閉dcard']
# ['百威旅行社', '百威旅遊', '百威旅行']
# ['信義 房屋']
# ['加百裕工業股份有限公司', '加百裕', '黃靖容']
# ['富玉珠寶', '富玉珠寶有限公司']
# ['台北牙周病醫生推薦 ', '微創水雷射', '水雷射牙周病']
# ['真理大學 site:ptt.cc', '真理大學 site:dcard.tw']
# ['高雄 全 戶 軟水', '高雄 淨水器推薦', '淨水系統 高雄']
# ['啟翔輕金屬', '啟翔', '陳百欽']

# Video search (tbm=vid, tbs=vd:m). A previously commented-out variant used
# 'https://www.google.com/search?q={}&num={}&hl={}&gl=tw' together with the
# XPath "//div[@class='yuRUbf']//a" and file names without the "_vi" suffix.
SEARCH_URL_TEMPLATE = 'https://www.google.com/search?q={}&num={}&hl={}&gl=tw&tbm=vid&tbs=vd:m'
RESULT_LINK_XPATH = "//div[@class='xe8e1b']//a"
RESULT_COLUMNS = ['搜尋詞', '結果標題', '結果網址', '結果名次']
PAGE_LOAD_WAIT_SECONDS = 6

driver = None


def restart_browser():
    """Start a new headless, incognito Chrome session and return it.

    The new session is also stored in the module-level ``driver`` variable.
    Any session previously stored there is NOT closed by this function;
    callers are responsible for calling ``quit()`` on sessions they are done
    with.

    Returns:
        The freshly created ``webdriver.Chrome`` instance, with cookies
        cleared and the window sized to 1400x20000.
    """
    global driver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    # options.add_argument('--remote-debugging-port=9222')
    # options.add_experimental_option("debuggerAddress", "192.168.192.45:9922")
    options.add_argument("--user-agent=" + USER_AGENT)
    options.add_argument("--incognito")
    # options.add_argument('--proxy-server=socks5://172.104.93.163:41800')
    driver = webdriver.Chrome(options=options, service=Service(CHROMEDRIVER_PATH))
    driver.delete_all_cookies()
    # Very tall window so the full-page screenshot covers the whole result list.
    driver.set_window_size(1400, 20000)
    return driver


def _collect_results(browser, term):
    """Return one ``(term, title, url, rank)`` tuple per result link on the page.

    Links whose attributes cannot be read are logged and skipped. Both
    attributes are read *before* anything is recorded, so a failure can never
    leave a half-filled row behind; ranks stay consecutive (1, 2, 3, ...)
    over the links that were read successfully.
    """
    links = browser.find_elements(By.XPATH, RESULT_LINK_XPATH)
    print(len(links))
    rows = []
    for link in links:
        try:
            href = link.get_attribute('href')
            title = link.text
        except Exception:
            print('href2 exception')
            traceback.print_exc()
            continue
        rows.append((term, title, href, str(len(rows) + 1)))
    return rows


def process_one(terms=None, date=DATE_TAG, output_dir=OUTPUT_DIR):
    """Search every term, saving a screenshot and a CSV of the result links.

    For each term, files named ``<date><term>_vi.png`` and
    ``<date><term>_vi.csv`` are written into ``output_dir``. Between terms the
    function sleeps for a random 80-90 seconds.

    Args:
        terms: Iterable of search strings; defaults to ``SEARCH_TERMS``.
        date: Prefix used in the output file names.
        output_dir: Directory that receives the screenshots and CSV files.

    Raises:
        SystemExit: If a term yields no result links, the whole run is aborted
            after ``'None'`` is printed (behaviour kept from the original
            script; presumably an empty page means the scraper is blocked).
    """
    terms = list(SEARCH_TERMS if terms is None else terms)
    last_index = len(terms) - 1
    for index, term in enumerate(terms):
        browser = restart_browser()
        try:
            url = SEARCH_URL_TEMPLATE.format(urllib.parse.quote(term), 100, 'zh-TW')
            browser.get(url)
            # Fixed pause to give the result page time to finish rendering.
            time.sleep(PAGE_LOAD_WAIT_SECONDS)
            print(browser.current_url)

            base_path = os.path.join(output_dir, date + term + '_vi')
            browser.save_screenshot(base_path + '.png')

            rows = _collect_results(browser, term)
            if not rows:
                print('None')
                sys.exit()

            pd.DataFrame(rows, columns=RESULT_COLUMNS).to_csv(base_path + '.csv')
        finally:
            # Always release the Chrome/chromedriver processes, including when
            # an exception or SystemExit is propagating.
            browser.quit()

        # Throttle between searches; no need to wait after the final term.
        if index < last_index:
            print('等待')
            time.sleep(random.randint(80, 90))


# Runs at module level, exactly as the original script did.
process_one()