# crawl_fb.py

import csv
from time import sleep
from urllib.parse import quote

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Keyword used for the Facebook page search.
keyword = '科技'
# There are too many posts to scroll through completely, so scrolling is capped.
# This value is passed to scroll_to_end() as `cut`: it limits the number of
# scroll rounds (it is a count, not a duration); 0 disables the cap.
max_scroll_time = 1
  12. def scroll_to_end(cut):
  13. SCROLL_PAUSE_TIME = 0.5
  14. last_height = driver.execute_script("return document.body.scrollHeight")
  15. exe_times = 0
  16. while True:
  17. driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
  18. sleep(SCROLL_PAUSE_TIME)
  19. new_height = driver.execute_script("return document.body.scrollHeight")
  20. if new_height == last_height:
  21. break
  22. exe_times += 1
  23. if cut !=0 :
  24. if exe_times>cut:
  25. break
  26. last_height = new_height
  27. def set_profile_path(profilepath):
  28. option = webdriver.ChromeOptions()
  29. option.add_argument('--disable-web-security')
  30. option.add_argument('--allow-running-insecure-content')
  31. option.add_argument("--user-data-dir="+profilepath+"\\")
  32. driver = webdriver.Chrome(options=option)
  33. return driver
  34. driver = set_profile_path('C:/Users/ming/AppData/Local/Google/Chrome/User Data/Default')
  35. driver.get('https://www.facebook.com/search/pages/?q='+keyword)
  36. scroll_to_end(max_scroll_time)
  37. sleep(1)
  38. clubs_elem = driver.find_elements(By.XPATH,"//div[@class='j83agx80 l9j0dhe7 k4urcfbm']")
  39. club_urls = []
  40. club_thumb = []
  41. for e in clubs_elem:
  42. url = e.find_elements(By.XPATH,".//span[@class='nc684nl6']/a")[0].get_attribute('href')
  43. club_urls.append(url)
  44. rows = []
  45. col_name = ['社團名稱','社團按讚','貼文按讚','貼文內容','日期','分享數','留言數']
  46. rows.append(col_name)
  47. thumb_icon_style='background-image: url("https://static.xx.fbcdn.net/rsrc.php/v3/yP/r/-yo1T7mJE5M.png"); background-position: 0px -618px; background-size: 26px 1376px; width: 20px; height: 20px; background-repeat: no-repeat; display: inline-block;'
  48. for url in club_urls:
  49. driver.get(url)
  50. #for page loading
  51. sleep(5)
  52. club_name = driver.find_element(By.XPATH,"//div[@class='tr9rh885']//h2/span/span").text
  53. thumbs = driver.find_element(By.XPATH,"//div[@class='taijpn5t cbu4d94t j83agx80']//span/span").text
  54. scroll_to_end(max_scroll_time)
  55. #過長貼文需要展開
  56. showmores = driver.find_elements(By.XPATH,".//*[contains(text(), '顯示更多')]")
  57. for show_btn in showmores:
  58. try:
  59. show_btn.click()
  60. except:
  61. pass
  62. post_elems = driver.find_elements(By.XPATH,"//div[@class='du4w35lb k4urcfbm l9j0dhe7 sjgh65i0']")
  63. for p_elem in post_elems:
  64. try:
  65. content = p_elem.find_element(By.XPATH,".//div[@class='ecm0bbzt hv4rvrfc ihqw7lf3 dati1w0a']").text.rstrip()
  66. time = p_elem.find_element(By.XPATH,".//span[@class='j1lvzwm4 stjgntxs ni8dbmo4 q9uorilb gpro0wi8']/b/b[not(@style='display: none;')]").text
  67. post_thumbs = p_elem.find_element(By.XPATH,".//span[@class='gpro0wi8 cwj9ozl2 bzsjyuwj ja2t1vim']").text
  68. shares_comments = p_elem.find_element(By.XPATH,".//div[@class='bp9cbjyn j83agx80 pfnyh3mw p1ueia1e']").text.splitlines()
  69. shares = ''
  70. comments = ''
  71. #分享、留言都塞在這div內
  72. if len(shares_comments) != 0:
  73. for item in shares_comments:
  74. if '分享' in item:
  75. shares = item
  76. else:
  77. comments = item
  78. if shares == '':
  79. shares=0
  80. if comments =='':
  81. comments=0
  82. #分享頁面沒有貼文內容
  83. if content != '':
  84. rows.append([club_name,thumbs,post_thumbs,content,time,shares,comments])
  85. print(time)
  86. print(content)
  87. print(shares)
  88. print(comments)
  89. print(post_thumbs)
  90. print('----------------------------------------------------------------')
  91. except:
  92. pass #Facebook streaming video posts have no content
  93. with open('out.csv', 'w', encoding='UTF8') as f:
  94. writer = csv.writer(f)
  95. for r in rows:
  96. writer.writerow(r)