123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108 |
import csv
from time import sleep

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Keyword used for the Facebook page search.
keyword: str = "科技"
# There can be too many posts to scroll through completely, so scrolling is
# capped; this value is handed to scroll_to_end() as its `cut` limit.
max_scroll_time: int = 1
def scroll_to_end(cut):
    """Keep scrolling the current page to its bottom until it stops growing.

    Uses the module-level ``driver``. After every scroll the function waits
    half a second and re-reads ``document.body.scrollHeight``.

    Args:
        cut: Limit on the number of scrolls that made the page taller. The
            loop stops once the page has grown more than ``cut`` times.
            ``0`` disables the limit.

    Returns:
        None. The function stops as soon as a scroll leaves the page height
        unchanged, or when the ``cut`` limit is exceeded.
    """
    pause_seconds = 0.5
    height_script = "return document.body.scrollHeight"

    previous_height = driver.execute_script(height_script)
    growth_count = 0
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Give the page time to load more content before measuring again.
        sleep(pause_seconds)
        current_height = driver.execute_script(height_script)

        # Height did not change: nothing more was loaded, so we are done.
        if current_height == previous_height:
            return

        growth_count += 1
        if cut != 0 and growth_count > cut:
            return
        previous_height = current_height
def set_profile_path(profilepath):
    """Launch Chrome with ``profilepath`` as its user-data directory.

    The browser is started with the ``--disable-web-security`` and
    ``--allow-running-insecure-content`` flags in addition to
    ``--user-data-dir``.

    Args:
        profilepath: Directory path as a string; a trailing backslash is
            appended before it is passed to Chrome.

    Returns:
        The ``webdriver.Chrome`` instance that was started.
    """
    chrome_options = webdriver.ChromeOptions()
    launch_flags = (
        '--disable-web-security',
        '--allow-running-insecure-content',
        "--user-data-dir=" + profilepath + "\\",
    )
    for flag in launch_flags:
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)
# Main script: search Facebook pages for `keyword`, visit every page found,
# collect its posts, and write everything to out.csv.
#
# NOTE(review): all XPath expressions below match hard-coded CSS class names;
# presumably they are tied to one specific Facebook front-end version and need
# to be re-checked before each run.
driver = set_profile_path('C:/Users/ming/AppData/Local/Google/Chrome/User Data/Default')

# First row of the CSV is the header; scraped post rows are appended after it.
col_name = ['社團名稱','社團按讚','貼文按讚','貼文內容','日期','分享數','留言數']
rows = [col_name]

# NOTE(review): the two names below are never read in the visible code; they
# are kept only so that nothing outside this view loses a module-level name.
club_thumb = []
thumb_icon_style='background-image: url("https://static.xx.fbcdn.net/rsrc.php/v3/yP/r/-yo1T7mJE5M.png"); background-position: 0px -618px; background-size: 26px 1376px; width: 20px; height: 20px; background-repeat: no-repeat; display: inline-block;'

try:
    # Step 1: run the page search and collect the link of every result card.
    driver.get('https://www.facebook.com/search/pages/?q='+keyword)
    scroll_to_end(max_scroll_time)
    sleep(1)
    clubs_elem = driver.find_elements(By.XPATH, "//div[@class='j83agx80 l9j0dhe7 k4urcfbm']")
    club_urls = []
    for e in clubs_elem:
        links = e.find_elements(By.XPATH, ".//span[@class='nc684nl6']/a")
        # A result card without a link used to raise IndexError and abort
        # the whole run; skip it instead.
        if not links:
            continue
        href = links[0].get_attribute('href')
        if href:
            club_urls.append(href)

    # Step 2: visit each page and scrape its posts.
    for url in club_urls:
        driver.get(url)
        # Fixed wait for the page to load.
        sleep(5)
        try:
            club_name = driver.find_element(By.XPATH, "//div[@class='tr9rh885']//h2/span/span").text
            thumbs = driver.find_element(By.XPATH, "//div[@class='taijpn5t cbu4d94t j83agx80']//span/span").text
        except WebDriverException:
            # Without the header elements there is nothing to label the rows
            # with; skip this page rather than losing every row collected so far.
            print('skipped page, header elements not found: ' + url)
            continue
        scroll_to_end(max_scroll_time)

        # Long posts have to be expanded before their text can be read.
        showmores = driver.find_elements(By.XPATH, ".//*[contains(text(), '顯示更多')]")
        for show_btn in showmores:
            try:
                show_btn.click()
            except WebDriverException:
                # Best effort: a button that cannot be clicked is ignored.
                continue

        post_elems = driver.find_elements(By.XPATH, "//div[@class='du4w35lb k4urcfbm l9j0dhe7 sjgh65i0']")
        for p_elem in post_elems:
            try:
                content = p_elem.find_element(By.XPATH, ".//div[@class='ecm0bbzt hv4rvrfc ihqw7lf3 dati1w0a']").text.rstrip()
                post_time = p_elem.find_element(By.XPATH, ".//span[@class='j1lvzwm4 stjgntxs ni8dbmo4 q9uorilb gpro0wi8']/b/b[not(@style='display: none;')]").text
                post_thumbs = p_elem.find_element(By.XPATH, ".//span[@class='gpro0wi8 cwj9ozl2 bzsjyuwj ja2t1vim']").text
                shares_comments = p_elem.find_element(By.XPATH, ".//div[@class='bp9cbjyn j83agx80 pfnyh3mw p1ueia1e']").text.splitlines()
            except WebDriverException:
                # Posts lacking one of these elements are skipped (the
                # original note: Facebook streaming video posts have no content).
                continue

            # Shared posts have no content of their own; do not record them.
            if content == '':
                continue

            # Share and comment counts both live in the same div: a line that
            # mentions "分享" is the share count, any other line is taken as
            # the comment count.
            shares = ''
            comments = ''
            for item in shares_comments:
                if '分享' in item:
                    shares = item
                else:
                    comments = item
            if shares == '':
                shares = 0
            if comments == '':
                comments = 0

            rows.append([club_name, thumbs, post_thumbs, content, post_time, shares, comments])
            print(post_time)
            print(content)
            print(shares)
            print(comments)
            print(post_thumbs)
            print('----------------------------------------------------------------')
finally:
    # Always close the browser and its driver process, even if scraping fails.
    driver.quit()

# newline='' lets the csv module control line endings itself; without it,
# Windows output gets an empty line after every row.
with open('out.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(rows)
-
-
|