Browse Source

上傳檔案到 'crawlers/fb_youtube'

ming 2 years ago
parent
commit
b297649159
1 changed files with 108 additions and 0 deletions
  1. 108 0
      crawlers/fb_youtube/crawl_fb.py

+ 108 - 0
crawlers/fb_youtube/crawl_fb.py

@@ -0,0 +1,108 @@
+import csv
+from time import sleep
+
+from selenium import webdriver
+from selenium.common.exceptions import TimeoutException
+from selenium.common.exceptions import WebDriverException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+# Search keyword appended to the Facebook page-search URL
+keyword = '科技'
+# Pages can hold more posts than can be scrolled through, so cap the scrolling (passed to scroll_to_end as `cut`)
+max_scroll_time = 1
+
+def scroll_to_end(cut):
+    SCROLL_PAUSE_TIME = 0.5
+    last_height = driver.execute_script("return document.body.scrollHeight")
+    exe_times = 0
+    while True:
+        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+        sleep(SCROLL_PAUSE_TIME)
+        new_height = driver.execute_script("return document.body.scrollHeight")
+        if new_height == last_height:
+            break
+        exe_times += 1
+        if cut !=0 :
+            if exe_times>cut:
+                break
+        last_height = new_height
+
+def set_profile_path(profilepath):
+    option = webdriver.ChromeOptions()
+    option.add_argument('--disable-web-security') 
+    option.add_argument('--allow-running-insecure-content') 
+    option.add_argument("--user-data-dir="+profilepath+"\\")
+    driver = webdriver.Chrome(options=option)
+    return driver
+
+driver = set_profile_path('C:/Users/ming/AppData/Local/Google/Chrome/User Data/Default')
+
+driver.get('https://www.facebook.com/search/pages/?q='+keyword)
+scroll_to_end(max_scroll_time)
+sleep(1)
+
+clubs_elem = driver.find_elements(By.XPATH,"//div[@class='j83agx80 l9j0dhe7 k4urcfbm']")
+club_urls = []
+club_thumb = []
+for e in clubs_elem:
+    
+    url = e.find_elements(By.XPATH,".//span[@class='nc684nl6']/a")[0].get_attribute('href')
+    club_urls.append(url)
+    
+
+rows = []
+col_name = ['社團名稱','社團按讚','貼文按讚','貼文內容','日期','分享數','留言數']
+rows.append(col_name)
+thumb_icon_style='background-image: url("https://static.xx.fbcdn.net/rsrc.php/v3/yP/r/-yo1T7mJE5M.png"); background-position: 0px -618px; background-size: 26px 1376px; width: 20px; height: 20px; background-repeat: no-repeat; display: inline-block;'
+for url in club_urls:
+    driver.get(url)
+    #for page loading
+    sleep(5)
+    club_name = driver.find_element(By.XPATH,"//div[@class='tr9rh885']//h2/span/span").text
+    thumbs = driver.find_element(By.XPATH,"//div[@class='taijpn5t cbu4d94t j83agx80']//span/span").text
+    scroll_to_end(max_scroll_time)
+    #過長貼文需要展開
+    showmores = driver.find_elements(By.XPATH,".//*[contains(text(), '顯示更多')]")
+    for show_btn in showmores:
+        try:
+            show_btn.click()
+        except:
+            pass
+    
+    post_elems = driver.find_elements(By.XPATH,"//div[@class='du4w35lb k4urcfbm l9j0dhe7 sjgh65i0']")
+    for p_elem in post_elems:
+        try:
+            content = p_elem.find_element(By.XPATH,".//div[@class='ecm0bbzt hv4rvrfc ihqw7lf3 dati1w0a']").text.rstrip()
+            time = p_elem.find_element(By.XPATH,".//span[@class='j1lvzwm4 stjgntxs ni8dbmo4 q9uorilb gpro0wi8']/b/b[not(@style='display: none;')]").text
+            post_thumbs = p_elem.find_element(By.XPATH,".//span[@class='gpro0wi8 cwj9ozl2 bzsjyuwj ja2t1vim']").text
+            shares_comments = p_elem.find_element(By.XPATH,".//div[@class='bp9cbjyn j83agx80 pfnyh3mw p1ueia1e']").text.splitlines()
+            shares = ''
+            comments = ''
+            #分享、留言都塞在這div內
+            if len(shares_comments) != 0:
+                for item in shares_comments:
+                    if '分享' in item:
+                        shares = item
+                    else:
+                        comments = item
+            if shares == '':
+                shares=0
+            if comments =='':
+                comments=0
+            #分享頁面沒有貼文內容
+            if content != '':
+                rows.append([club_name,thumbs,post_thumbs,content,time,shares,comments])
+            print(time)
+            print(content)
+            print(shares)
+            print(comments)
+            print(post_thumbs)
+            print('----------------------------------------------------------------')
+        except:
+            pass #Facebook streaming video posts have no content
+
+with open('out.csv', 'w', encoding='UTF8') as f:
+    writer = csv.writer(f)
+    for r in rows:
+        writer.writerow(r)
+    
+