123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107 |
- import random
- import sys
- import dataset
- from selenium import webdriver
- from selenium.webdriver.chrome.service import Service
- from selenium.webdriver.common.by import By
- import traceback
- import datetime
- import codecs
- import time
- import urllib
- import argparse
- import logging
- import sys
- from logging.handlers import SysLogHandler
- import socket
- import pandas as pd
- import socket
- import os
- import dataset
- import pymysql
- pymysql.install_as_MySQLdb()
# Module-level handle to the most recently started Chrome session.
# It is (re)assigned by restart_browser().
driver = None


def restart_browser():
    """Start a new headless Chrome session and return it.

    The session is created with a fixed desktop Chrome user-agent string,
    its cookies are cleared, and its window is resized to 1400x20000
    (presumably so one screenshot covers a whole results page -- confirm).
    The new WebDriver is also stored in the module-level ``driver`` global.

    Any session previously stored in ``driver`` is NOT closed here; callers
    are expected to ``quit()`` it themselves.

    Returns:
        The newly created ``webdriver.Chrome`` instance.
    """
    global driver
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
    # Raw string: the path contains backslash-slash pairs, which are invalid
    # escape sequences in a normal literal (SyntaxWarning on Python 3.12+).
    # The resulting runtime value is unchanged.
    service = Service(r'C:\/Users\/s1301\/Downloads\/chromedriver_116\/chromedriver-win32\/chromedriver')

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument("--user-agent=" + user_agent)
    # Optional switches kept for reference (disabled):
    # options.add_argument('--remote-debugging-port=9222')
    # options.add_experimental_option("debuggerAddress", "192.168.192.45:9922")
    # options.add_argument("--incognito")
    # options.add_argument('--proxy-server=socks5://172.104.93.163:41800')

    driver = webdriver.Chrome(options=options, service=service)
    driver.delete_all_cookies()
    driver.set_window_size(1400, 20000)
    return driver
def _collect_results(elements, term):
    """Turn the result anchors of one page into the four output columns.

    Args:
        elements: Iterable of objects exposing ``get_attribute('href')`` and
            a ``text`` attribute (the elements returned by ``find_elements``).
        term: The search term these results belong to.

    Returns:
        A dict of equally long lists keyed by the output column names. Ranks
        are 1-based strings and count only the elements that were read
        successfully.
    """
    rows = {'搜尋詞': [], '結果標題': [], '結果網址': [], '結果名次': []}
    for element in elements:
        try:
            # Read both values before appending anything, so a failure on one
            # element cannot leave the columns with different lengths.
            href = element.get_attribute('href')
            title = element.text
        except Exception:
            print('href2 exception')
            traceback.print_exc()
            continue
        rows['搜尋詞'].append(term)
        rows['結果標題'].append(title)
        rows['結果網址'].append(href)
        rows['結果名次'].append(str(len(rows['結果名次']) + 1))
    return rows


def process_one(terms=None, date='0907'):
    """Scrape Google results for each search term and save them to disk.

    For every term a fresh browser is started via ``restart_browser()``, the
    Google search page (``num=100``, ``hl=zh-TW``, ``gl=tw``) is loaded, a
    screenshot is saved, and the title / URL / rank of every anchor matched
    by the result XPath is written to a spreadsheet. Both files are written to
    a hard-coded directory and named ``<date><term>.png`` / ``<date><term>.xls``.

    After each term (including the last one) the function sleeps for a random
    100-120 seconds.

    Args:
        terms: Search terms to process. Defaults to the built-in keyword list
            when ``None``; pass a list to run a different keyword set.
        date: Prefix used in the output file names.

    Raises:
        SystemExit: If a term yields no results; the browser is closed first.
    """
    # Imported here because the file only does ``import urllib``, which does
    # not guarantee that the ``parse`` submodule has been loaded.
    from urllib.parse import quote

    if terms is None:
        terms = ['雙響泡', '双響泡', '雙響砲']

    # Raw string: the backslash-slash pairs are invalid escapes in a normal
    # literal (SyntaxWarning on Python 3.12+); the runtime value is unchanged.
    output_prefix = r'C:\/Users\/s1301\/OneDrive\/Pictures\/Saved Pictures\/' + date

    for term in terms:
        driver = restart_browser()
        try:
            googleurl = 'https://www.google.com/search?q={}&num={}&hl={}&gl=tw'.format(quote(term), 100, 'zh-TW')
            driver.get(googleurl)
            # Fixed pause before reading the page (presumably to let it render).
            time.sleep(6)
            print(driver.current_url)
            driver.save_screenshot(output_prefix + term + '.png')

            elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']//a")
            print(len(elmts))
            datadict = _collect_results(elmts, term)

            if not datadict['結果標題']:
                print('None')
                # The ``finally`` clause below closes the browser before exit.
                sys.exit()

            # NOTE(review): writing ``.xls`` needs the xlwt engine, which newer
            # pandas releases no longer ship -- confirm the installed version
            # still supports this extension.
            pd.DataFrame(datadict).to_excel(output_prefix + term + ".xls")
        finally:
            # Always release the browser, even if a step above raised.
            driver.quit()

        print('等待')
        time.sleep(random.randint(100, 120))
# Script entry point: called unconditionally at module load (no
# `if __name__ == "__main__"` guard is visible here), so importing this file
# starts the scrape as well.
- process_one()
|