"""Scrape Facebook page-search results for a keyword and dump posts to CSV.

Flow, as implemented below:
  1. Open the Facebook page search for ``keyword`` and scroll to load results.
  2. Collect the link of every result card.
  3. Visit each link, read the page name and like count, expand truncated
     posts, and extract content / date / likes / shares / comments per post.
  4. Write all collected rows to ``out.csv``.

All XPath class names are tied to a specific Facebook markup revision.
"""
import csv
from time import sleep
from urllib.parse import quote

from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Search keyword.
keyword = '科技'
# There are too many posts to scroll through, so cap the scrolling.
max_scroll_time = 1


def scroll_to_end(cut):
    """Scroll the page loaded in the global ``driver`` towards its bottom.

    Scrolls to ``document.body.scrollHeight`` repeatedly, pausing 0.5 s after
    each scroll. Stops when a scroll no longer changes the page height, or
    once the number of height-changing scrolls exceeds ``cut``.

    Args:
        cut: Cap on height-changing scrolls. ``0`` disables the cap, so the
            loop only ends when the page height stops growing. With a
            non-zero value at most ``cut + 1`` scrolls are performed.
    """
    SCROLL_PAUSE_TIME = 0.5
    last_height = driver.execute_script("return document.body.scrollHeight")
    exe_times = 0
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(SCROLL_PAUSE_TIME)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        exe_times += 1
        if cut != 0 and exe_times > cut:
            break
        last_height = new_height


def set_profile_path(profilepath):
    """Start Chrome with ``profilepath`` as its user data directory.

    Args:
        profilepath: Directory passed to Chrome via ``--user-data-dir``
            (a trailing backslash is appended).

    Returns:
        The created ``webdriver.Chrome`` instance.
    """
    option = webdriver.ChromeOptions()
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    # NOTE(review): the caller passes ".../User Data/Default"; Chrome normally
    # expects ".../User Data" here plus --profile-directory - confirm intent.
    option.add_argument("--user-data-dir="+profilepath+"\\")
    driver = webdriver.Chrome(options=option)
    return driver


driver = set_profile_path('C:/Users/ming/AppData/Local/Google/Chrome/User Data/Default')

rows = []
col_name = ['社團名稱','社團按讚','貼文按讚','貼文內容','日期','分享數','留言數']
rows.append(col_name)
club_urls = []
# The two names below are not referenced anywhere in the visible code; kept
# so that nothing relying on them breaks.
club_thumb = []
thumb_icon_style='background-image: url("https://static.xx.fbcdn.net/rsrc.php/v3/yP/r/-yo1T7mJE5M.png"); background-position: 0px -618px; background-size: 26px 1376px; width: 20px; height: 20px; background-repeat: no-repeat; display: inline-block;'

try:
    # Percent-encode the keyword so characters such as '&', '#' or spaces
    # cannot break the query string.
    driver.get('https://www.facebook.com/search/pages/?q=' + quote(keyword))
    scroll_to_end(max_scroll_time)
    sleep(1)

    clubs_elem = driver.find_elements(By.XPATH,"//div[@class='j83agx80 l9j0dhe7 k4urcfbm']")
    for e in clubs_elem:
        links = e.find_elements(By.XPATH,".//span[@class='nc684nl6']/a")
        if not links:
            # Result card without a link: nothing to visit.
            continue
        url = links[0].get_attribute('href')
        if url:
            club_urls.append(url)

    for url in club_urls:
        driver.get(url)
        # Fixed wait for the page to load.
        sleep(5)
        try:
            club_name = driver.find_element(By.XPATH,"//div[@class='tr9rh885']//h2/span/span").text
            thumbs = driver.find_element(By.XPATH,"//div[@class='taijpn5t cbu4d94t j83agx80']//span/span").text
        except NoSuchElementException:
            # One page with an unexpected layout must not abort the whole
            # run and lose the rows collected so far.
            print('skipped (page name / like count not found): ' + url)
            continue
        scroll_to_end(max_scroll_time)

        # Long posts are truncated and have to be expanded first.
        showmores = driver.find_elements(By.XPATH,".//*[contains(text(), '顯示更多')]")
        for show_btn in showmores:
            try:
                show_btn.click()
            except WebDriverException:
                # Best effort: a button may be covered, stale or not
                # interactable; the post is then read in truncated form.
                pass

        post_elems = driver.find_elements(By.XPATH,"//div[@class='du4w35lb k4urcfbm l9j0dhe7 sjgh65i0']")
        for p_elem in post_elems:
            try:
                content = p_elem.find_element(By.XPATH,".//div[@class='ecm0bbzt hv4rvrfc ihqw7lf3 dati1w0a']").text.rstrip()
                post_time = p_elem.find_element(By.XPATH,".//span[@class='j1lvzwm4 stjgntxs ni8dbmo4 q9uorilb gpro0wi8']/b/b[not(@style='display: none;')]").text
                post_thumbs = p_elem.find_element(By.XPATH,".//span[@class='gpro0wi8 cwj9ozl2 bzsjyuwj ja2t1vim']").text
                shares_comments = p_elem.find_element(By.XPATH,".//div[@class='bp9cbjyn j83agx80 pfnyh3mw p1ueia1e']").text.splitlines()
            except (NoSuchElementException, StaleElementReferenceException):
                # Posts lacking one of these elements are skipped, e.g.
                # Facebook streaming video posts have no content element.
                continue

            # Share and comment counts both live in the same div, one per line.
            # NOTE(review): every line without '分享' is treated as the comment
            # count (last one wins) - confirm no other line kinds appear here.
            shares = ''
            comments = ''
            for item in shares_comments:
                if '分享' in item:
                    shares = item
                else:
                    comments = item
            if shares == '':
                shares = 0
            if comments == '':
                comments = 0

            # Shared-page posts have no post content; leave them out.
            if content != '':
                rows.append([club_name,thumbs,post_thumbs,content,post_time,shares,comments])
                print(post_time)
                print(content)
                print(shares)
                print(comments)
                print(post_thumbs)
                print('----------------------------------------------------------------')
finally:
    # Always shut the browser / chromedriver down, even after a failure.
    driver.quit()

# newline='' is required by the csv module; without it every row is followed
# by an empty line on Windows.
with open('out.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(rows)